\documentclass[handout]{beamer}
%\documentclass{beamer}

\setbeamertemplate{navigation symbols}{}



\usepackage{amsfonts}
\usepackage{tikz}
\usetikzlibrary{shapes}
\usepackage{amstext}


\usepackage{psfrag}
\usepackage{texnames}
\usepackage[latin1]{inputenc}
\usepackage{amsmath}
\usepackage{amssymb}

\graphicspath{{Figures/}}
\setkeys{Gin}{width=\textwidth}

% \usepackage{url}
% \url
% \path
% \newcommand\email{\begingroup \urlstyle{tt}\Url}

%\newcommand\Var{\mbox{\rm \,Var\,}}
%\newcommand\Cov{\mbox{\rm \,Cov\,}}
\def \IR{\hbox{{\rm I}\kern-.2em\hbox{{\rm R}}}}

%\newcommand{\bmone}{\mbox{\boldmath$1$}}
\newcommand{\ivprime}{\prime \prime \prime \prime}
\newcommand{\newref}{\par \vspace{\baselineskip} \par \noindent}
\newcommand{\bx}{\mbox{\mbox{\boldmath $x$}}}
\newcommand{\btheta}{\mbox{\mbox{\boldmath $\theta$}}}
\newcommand{\bbeta}{\mbox{\mbox{\boldmath $\beta$}}}
\newcommand{\bu}{\mbox{\mbox{\boldmath $u$}}}
\newcommand{\bphi}{\mbox{\mbox{\boldmath $\phi$}}}

\newcommand{\T}{\rm{\scriptstyle{T}}}


\DeclareMathOperator{\E}{E}

% Generic commands

\newcommand{\Expect}[1]{\mathbb{E} \left[{#1}\right]}
\newcommand{\Expects}[2]{\mathbb{E}_{{#1}} \left[{#2}\right]}
\newcommand{\Var}[1]{\mbox{Var} \left[{#1}\right]}
\newcommand{\Vars}[2]{\mbox{Var}_{#1} \left[{#2}\right]}
\newcommand{\Cov}[1]{\mbox{Cov} \left[{#1}\right]}
\newcommand{\Corr}[1]{\mbox{Corr} \left[{#1}\right]}
\newcommand{\Prob}[1]{P \left({#1}\right)}
\newcommand{\Abs}[1]{\left\vert{#1}\right\vert}
\newcommand{\Norm}[1]{\left\vert\left\vert{#1}\right\vert\right\vert}
\newcommand{\Det}[1]{\left\vert{#1}\right\vert}
\newcommand{\Gam}[1]{\mbox{Gam}\left({#1}\right)}
\newcommand{\Gauss}[1]{\mbox{N}\left({#1}\right)}

\newcommand{\cas}{\stackrel{a.s.}{\longrightarrow}~}
\newcommand{\cid}{\stackrel{D}{\longrightarrow}~}
\newcommand{\cip}{\stackrel{p}{\longrightarrow}~}
\newcommand{\cms}{\stackrel{m.s.}{\longrightarrow}~}

\newcommand{\Space}{\Re^d}
\newcommand{\Spacee}{\mathcal{E}}
% indicator
\newcommand{\ind}[1]{1\hspace{-2.3mm}{1}_{\left\{{#1} \right\}}} 

\newcommand{\bmehat}{\mathbf{\hat{e}}}
\newcommand{\bxhat}{\mathbf{\hat{x}}}
\newcommand{\byhat}{\mathbf{\hat{y}}}
\newcommand{\bYhat}{\mathbf{\hat{Y}}}

\newcommand{\bma}{\mathbf{a}}
\newcommand{\bmA}{\mathbf{A}}
\newcommand{\bmb}{\mathbf{b}}
\newcommand{\bmB}{\mathbf{B}}
\newcommand{\bmc}{\mathbf{c}}
\newcommand{\bmC}{\mathbf{C}}
\newcommand{\bmd}{\mathbf{d}}
\newcommand{\bmD}{\mathbf{D}}
\newcommand{\bme}{\mathbf{e}}
\newcommand{\bmE}{\mathbf{E}}
\newcommand{\bmf}{\mathbf{f}}
\newcommand{\bmF}{\mathbf{F}}
\newcommand{\bmg}{\mathbf{g}}
\newcommand{\bmG}{\mathbf{G}}
\newcommand{\bmh}{\mathbf{h}}
\newcommand{\bmH}{\mathbf{H}}
\newcommand{\bmi}{\mathbf{i}}
\newcommand{\bmI}{\mathbf{I}}
\newcommand{\bmL}{\mathbf{L}}
\newcommand{\bmO}{\mathbf{O}}
\newcommand{\bmr}{\mathbf{r}}
\newcommand{\bmR}{\mathbf{R}}
\newcommand{\bms}{\mathbf{s}}
\newcommand{\bmS}{\mathbf{S}}
\newcommand{\bmT}{\mathbf{T}}
\newcommand{\bmu}{\mathbf{u}}
\newcommand{\bmU}{\mathbf{U}}
\newcommand{\bmV}{\mathbf{V}}
\newcommand{\bmw}{\mathbf{w}}
\newcommand{\bmW}{\mathbf{W}}
\newcommand{\bmx}{\mathbf{x}}
\newcommand{\bmX}{\mathbf{X}}
\newcommand{\bmy}{\mathbf{y}}
\newcommand{\bmY}{\mathbf{Y}}
\newcommand{\bmz}{\mathbf{z}}
\newcommand{\bmZ}{\mathbf{Z}}

\newcommand{\bmzero}{\mathbf{0}}
\newcommand{\bmone}{\mathbf{1}}

\newcommand{\bmbeta}{\mbox{\boldmath$\beta$}}
\newcommand{\bmdelta}{\mbox{\boldmath$\delta$}}
\newcommand{\bmepsilon}{\mbox{\boldmath$\epsilon$}}
\newcommand{\bmvarepsilon}{\mbox{\boldmath$\varepsilon$}}
\newcommand{\bmlambda}{\mbox{\boldmath$\lambda$}}
\newcommand{\bmLambda}{\mbox{\boldmath$\Lambda$}}
\newcommand{\bmmu}{\mbox{\boldmath$\mu$}}
\newcommand{\bmSigma}{\mbox{\boldmath$\Sigma$}}
\newcommand{\bmtheta}{\mbox{\boldmath $\theta$}}
\newcommand{\bmell}{\mbox{\boldmath $\ell$}}
\newcommand{\bmomega}{\mbox{\boldmath $\omega$}}



      
\newcommand{\thickdashed}{( \rule[2pt]{7pt}{2pt} \rule[2pt]{7pt}{2pt} \rule[2pt]{7pt}{2pt} )}  
   

\begin{document}


\begin{frame}
\frametitle{Spatial Modelling}
\begin{itemize}
	\item Patrick Brown
	\item Cancer Care Ontario
	\item 620 University Ave, 12th floor.
	\item {\tt patrick.brown@cancercare.on.ca}

\end{itemize}

With thanks to Paulo Ribeiro (Curitiba) and Chris Sherlock (Lancaster) for provision of course notes!
\end{frame}


\begin{frame}
\frametitle{Motivating example}
\begin{itemize}
	\item John Snow and the Broad Street Pump
	\item Cholera outbreak in Soho, London, in 1854
	\item London had 1.5 million people and no sewage system
	\item The prevailing miasma theory said cholera was spread by bad air.
	\item Snow believed it was transmitted by contaminated water
	\item He talked to local residents, making note of where they lived
\end{itemize}

\end{frame}


\begin{frame}
\frametitle{Locations of Cholera Victims}

\includegraphics[height=4in,width=4in]{Figures/Snow-cholera-map.png}

\end{frame}

\begin{frame}
\frametitle{}
\begin{itemize}
	\item Cholera cases are more likely to occur close to the Broad Street Pump
	\item Snow found microbes in the water
	\item He asked for the handle on the pump to be removed, which stopped the cholera epidemic
	\pause
	\item Actually, the epidemic was declining before the handle was removed.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Spatial epidemiology}
\begin{itemize}
	\item Data are spatially referenced
	\item Two, or more, dimensions
	\item Is there a spatial pattern to disease incidence locations or rates?
	\item Can we quantify the spatial dependence?
	\item Is this a simple extension of time series analysis?
\end{itemize}
\end{frame}


\begin{frame}
\frametitle{Time series analysis?}
\begin{itemize}
	\item A surprisingly complicated extension
	\item There is no natural ordering for spatial data
	\item In time series the present depends only on the past
	\item $Y_t$ depends on $Y_{t-1}$ (and $Y_{t-2}$ and $Y_{t-3}$?)
	\item Continuous time $Y(t)$, $dY(t)/dt$, $d^2 Y(t)/dt^2$
	\item No such simplifications for spatial processes (or they're not as straightforward)
	\item Spatial models, compared to time series models, are typically
\begin{itemize}
	\item Simpler; 
	\item Computationally more demanding; and
	\item limited in the size of dataset they can handle
\end{itemize}
\end{itemize}

\end{frame}

\begin{frame}
\frametitle{Spatial Statistics}
\begin{itemize}
	\item Geostatistical models
\begin{itemize}
	\item A surface which is defined everywhere on a region.
\end{itemize}
	\item Discrete spatial variation
\begin{itemize}
	\item A surface defined only at discrete points or regions (possibly irregular)
\end{itemize}
	\item Spatial point processes
\begin{itemize}
	\item Data consist of the locations of events
\end{itemize}
\end{itemize}
\end{frame}



\begin{frame}
\frametitle{Discrete Processes }
\begin{itemize}
	\item Artificial lattice: pixel grid or census districts
	\end{itemize}
	
	\begin{columns}
	
	\column{0.5\textwidth}
	
	Elevation of a drainage basin

\includegraphics[width=\textwidth]{Figures/laval_grid.png}	

\column{0.5\textwidth}

Cancer rates in Birmingham electoral wards

\includegraphics[width=\textwidth,trim=0 0 0 50mm,clip=true]{Figures/birmingham.pdf}

\end{columns}

\end{frame}

\begin{frame}
\frametitle{Lattice processes}
\begin{itemize}
	\item Natural lattices: school district boundaries or cell wall boundaries	
\end{itemize}
	\begin{columns}
	
	\column{0.5\textwidth}
	
	Cell images

\includegraphics[width=\textwidth]{Figures/rawCell.jpg}	

\column{0.5\textwidth}

Discrete spatial process

\includegraphics[width=\textwidth]{Figures/finalCell.jpg}

Colours represent cell density

\end{columns}

\end{frame}



\begin{frame}
\frametitle{Point processes}
\begin{itemize}
	\item Lung cancer in Ontario
	\begin{columns}
	\column{0.5\textwidth} Cases 
	
	\includegraphics[width=\textwidth]{Figures/cases.pdf}
	\column{0.5\textwidth} Controls 
	
	\includegraphics[width=\textwidth]{Figures/controls.pdf}
	\end{columns}
\begin{itemize}
	\item Is there spatial variation in cancer incidence?	
\begin{itemize}
	\item More cases near a specific location such as a power plant?
	\item Cases tending to cluster near other cases?
\end{itemize}
\end{itemize}
\end{itemize}
\end{frame}



\begin{frame}
\frametitle{Geostatistics}
\begin{itemize}
	\item Rainfall in Paran\'a state, Brazil
	\item Exists everywhere, but is only evaluated at a few points
\end{itemize}

\includegraphics[width=0.7\textwidth,trim=0in 0.3in 0.3in 3in,clip=true]{Figures/parana4.pdf}

\begin{itemize}
	\item Can we interpolate rainfall where we have no measurements?
	\item Can we put standard errors on our predictions?
\end{itemize}
\end{frame}


\begin{frame}
\frametitle{Geostatistics}
\begin{itemize}
	\item Intensity of lung cancer cases in Ontario
	\item Unobserved, estimated by modelling
\end{itemize}
\includegraphics[width=\textwidth]{Figures/cancerIntensity.png}

\end{frame}


\begin{frame}
\frametitle{This Course}
\begin{itemize}
	\item Geostatistics --- 6 weeks
	\item Point Processes --- 3 weeks
	\item Discrete spatial variation --- 1 week
	\item Markov Random fields ???
	\item Spatio-temporal models ?? 
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Labs}
\begin{itemize}
\item 1 hour in 256 McCaul, following lectures
	\item Using R and WinBUGS
	\item analyses of spatial datasets
	\item you're encouraged to work on your own computers.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Assessment}
\begin{itemize}
	\item One small project 20\%
	\item One larger project with a presentation 40\%
	\item Exam 40 \%
\end{itemize}
\end{frame}



\begin{frame}
\frametitle{Books}
Main books:
\begin{itemize}
	\item Diggle and Ribeiro (2006) Model-based Geostatistics {\tt amazon.com/dp/0387329072}
	\item Diggle (2003) Statistical Analysis of Spatial Point Patterns {\tt amazon.com/dp/0340740701}
	\end{itemize}
	Other books:
	\begin{itemize}
	\item M{\o}ller and Waagepetersen ``Statistical inference and simulation for spatial point processes''
 {\tt www.myilibrary.com/browse/open.asp?ID=19973} for a more technical treatment of point processes and model fitting
	\item Rue and Held ``Gaussian Markov Random Fields'' {\tt amazon.com/dp/1584884320}, if we do Markov random fields.
\item See also {\tt http://www.ai-geostats.org}
\end{itemize}

\end{frame} 
\begin{frame}
  
 \frametitle{ Contents}
 
      \begin{enumerate}
     
        
      \item{\uppercase{\bf Introduction }}
        
      \item{\uppercase{\bf Basic Geostatistical Model}}
        
      \item{\uppercase{\bf Exploratory Variogram Analysis }}
        
      \item{\uppercase{\bf Parameter Estimation for Gaussian Models }}
        
      \item{\uppercase{\bf Spatial Prediction for Gaussian Models }}

      \item{\uppercase{\bf Generalised Linear Geostatistical Models }}

      \item{\uppercase{\bf Further Topics, History and Philosophy }}
                
      \end{enumerate}
 


\end{frame} \begin{frame}
  
  \frametitle{ PART I: INTRODUCTION}
      


      \begin{enumerate}
             
        
      \item{\bf Basic Examples of Spatial Data}
        
      \item{\bf A Taxonomy for Spatial Statistics}
        
      \item{\bf Further Examples of Geostatistical Problems}
        
      \item{\bf Characteristic Features of Geostatistical Problems }
        
      \item{\bf Core Geostatistical Problems}
                
      \end{enumerate}
 
  
  
  \end{frame} \begin{frame}

  
    
    \frametitle{Basic Examples of Spatial Data}
 
    
    \begin{itemize}
      
    \item{\bf Campylobacter cases in southern England}
      \end{itemize}      
      
      
      Residential locations of 651 cases of campylobacter reported
      over a one-year period in central southern England.

      
      
      \includegraphics[width=0.8\textwidth,trim=0in 0.3in 0.3in 3in,clip=true]{Figures/campy.pdf}
      
      \end{frame} \begin{frame}


    \frametitle{Cancer rates in administrative regions}
      
      Grey-scale corresponds to estimated variation in relative
      risk of colorectal cancer in the 36 electoral wards of
      the city of Birmingham, UK.
      
     \includegraphics[width=0.7\textwidth,trim=0in 0.3in 0.3in 2in,clip=true]{Figures/birmingham.pdf}
      
      \end{frame} \begin{frame}
      
      
  \frametitle{Rainfall in Paran\'a State, Brasil}
      
      
      
      Rainfall measurements at 143 recording stations.\\
      Average for the May--June period (dry season).
      
      
      
      \includegraphics[width=1\textwidth,trim=0in 0.3in 0.3in 3in,clip=true]{Figures/parana4.pdf}
      
      

          
    
    \end{frame} \begin{frame}
    
    
\frametitle{A Taxonomy of Spatial Statistics}

    
    \begin{enumerate}
                
      
    \item {\bf Spatial point processes}
      

      
      
      {\it Basic structure.} Countable set of points $x_i \in \IR^2$,
      generated stochastically.\\
e.g. cases of campylobacter.
%      \begin{itemize}
%      \item data sometimes converted to apparently discrete
%        spatial variation by
%        aggregation over sub-regions
%      \end{itemize}

    \item{\bf Discrete spatial variation}
      

      
      {\it Basic structure.} $Y_i : i=1,...,n$ .\\
e.g. number of cancer  cases in an electoral region.
      \begin{itemize}
                  
    \item rarely arises naturally
      \item but often useful as a pragmatic strategy
 %     \item models typically defined indirectly from    full\\
%        conditionals, $[Y_i| Y_j, \forall j \neq i]$
      \end{itemize}
      
      
      
      
    \item {\bf Continuous spatial variation}
      
   
      
      {\it Basic structure.} $Y(\bmx) : \bmx \in \IR^2$

      Data $(y_i,\bmx_i) : i=1,...,n$\\
        e.g. rainfall measurements at locations $x_i$.\\
Locations may be:
        \begin{itemize}
        \item non-stochastic (eg lattice to cover observation
          region $A$)
        \item or stochastic, {\it but independent
            of the\\
            process $Y(\bmx)$}
        \end{itemize}
        
      
      

      
    \end{enumerate}
    
    \end{frame} \begin{frame}

        
  \begin{description}  

    \item[Spatial statistics] is the collection of
    statistical
    methods in which spatial locations
    play an explicit role in the analysis of data.
    



    \item[Geostatistics] is that part of spatial statistics concerned
    with data obtained by spatially discrete sampling of a spatially
    continuous process.

\end{description}

Don't confuse the {\it data-format}  with the  {\it underlying process}

    
    
%    {\bf Two strategic issues:}
    
%    \begin{itemize}
%              
      
%    
      
%    \item the choice of model may be influenced by the
%      scientific objectives of the study -- {\it analyse
%        problems, not data}
  
%    \end{itemize}
    

    \end{frame} \begin{frame}
 \frametitle{ Further Examples of Geostatistical Problems}
  \begin{block}{Swiss rainfall data}
   
    
\begin{columns}
\column{0.5\textwidth}
      
      

     
      
      \includegraphics[width=\textwidth,trim=2in 3in 2in 4.5in,clip=true]{Figures/avig1.pdf}
  


    \column{0.5\textwidth}
      
      \begin{itemize}
  \item 
      Locations shown as points with size proportional to
        the value of the observed rainfall.               
      \item  $467$ locations in Switzerland 
    
      \item daily rainfall measurements on 8th of May 1986
        
  
      \end{itemize}
      
      \end{columns}
      \end{block}
   data from: {\it Spatial Interpolation Comparison 97}
\small{\tt http://www.ai-geostats.org/resources/famous\_geostats\_data.htm}

      \end{frame} \begin{frame}
      
      
  \frametitle{Calcium and magnesium contents in a soil}
    178 measurements of Calcium and Magnesium contents taken
    on the $0$--$20$\,cm (and $20$--$40$\,cm) soil layers
      
\begin{columns}
\column{0.5\textwidth}
  
  
      \includegraphics[width=\textwidth,trim=0in 0.3in 0.3in 1in,clip=true]{Figures/ca20.pdf}

      \column{0.5\textwidth}
      \begin{itemize}
        \item fertility maps 
        \item assess effects of soil regime and elevation
        \item joint model for Ca and Mg
      \end{itemize}
\end{columns}

      \end{frame} \begin{frame}


\frametitle{Rongelap Island}
      
      
     \begin{columns}\column{0.4\textwidth} 
      \begin{itemize}
      \item  study of residual contamination,
        following nuclear
        weapons testing programme during the 1950s
      \item island evacuated in 1985, is it now safe for re-settlement?
   
      \end{itemize}
                                % map of sampling locations
  \column{0.6\textwidth} 
      \includegraphics[width=1.1\textwidth,trim=0.5in 1.6in 0.8in 2.1in,clip=true]{Figures/rongelap.pdf}
      
\end{columns}
\begin{itemize}
	   \item survey yields noisy measurements $Y_i$ of radioactive
        caesium concentrations
      \item initial grid of locations $x_i$ at 200m spacing
        later 
        supplemented by in-fill squares at 40m spacing.
      \item particular interest in maximum caesium concentration
\end{itemize}
%     \end{frame} \begin{frame}
     
%     {\bf \item Presence of species of moss}
     
%     \begin{itemize}
%       
%     \item response 0/1: presence or absence
%     \item identify factors related to the presence of moss species in trees within the study area
%     \item covarites: diameter, humidity, shadow, coverage, tree status
%     \end{itemize}
     
%     \includegraphics[width=\textwidth]{figs/tord1.ps,height=0.7\paperwidth}}
     
    
   \end{frame}
     
   \begin{frame}
      
    \frametitle{Gambia malaria}
       \begin{itemize}
      \item survey of villages in Gambia
      \item in village $i$, data $Y_{ij}=0/1$ denotes
        absence/presence of malarial parasites in blood
        sample from child $j$
  
      \item interest in effects of covariates, and pattern of residual
        spatial variation
      \end{itemize}   
      \begin{columns}\column{0.5\textwidth}   
   
\begin{itemize}
	    \item village-level covariates: 
        \begin{itemize}
        \item village locations
        \item public health centre in village?
        \item satellite-derived vegetation green-ness index
        \end{itemize}
      \item child-level covariates: 
        \begin{itemize}
        \item age, sex, bed-net use
        \end{itemize}
\end{itemize}
  
                                % map of village locations
  \column{0.5\textwidth}  
      \includegraphics[width=1.1\textwidth,trim=0in 1in 0in 1in,clip=true]{Figures/gambia1.pdf}
    \end{columns}  
      
 

    \end{frame} \begin{frame}

  \frametitle{Characteristic Features of Geostatistical Data}

    
    
    
    \begin{itemize}
      
      
    \item data consist of {\bf responses} $Y_i:=Y(\bmx_i)$ associated
      with {\bf locations} $\bmx_i$
      
    \item in principle, $Y$ could be determined from any
      location $\bmx$ within a continuous spatial region $A$
      
    \item it is reasonable to behave as if $\{Y(\bmx) : \bmx \in A\}$
      is a stochastic process
      
    \item $\bmx_i$ is typically fixed. 
      If the locations $\bmx_i$ are generated by a stochastic
      point process, it is reasonable to behave as if this point
      process is independent of the $Y(\bmx)$ process
      
    \item scientific objectives include prediction of one or
      more functionals of a stochastic \textbf{signal} process 
      $\{S(\bmx) : \bmx \in A\}$
      conditional on observations of the $Y(\bmx)$ process.
      
    \end{itemize}
    
    
   \end{frame} \begin{frame}
  \frametitle{Core Geostatistical Problems}

    
    
    
    \begin{itemize}
      
      
    \item{\bf Design}
      
      \begin{itemize}
        
      \item how many locations?
        
      \item how many measurements?
        
      \item spatial layout of the locations?
        
      \item what to measure at each location?
        
      \end{itemize}
      
    \item{\bf Modelling}
      
      \begin{itemize}
        
      \item probability model for the signal, $[S]$
        
      \item conditional probability model for the measurements,
        $[Y|S]$
        
      \end{itemize}
      
    \item {\bf Estimation}
      
      \begin{itemize}
        
      \item assign values to unknown model parameters
        
      \item make inferences about (functions of) model\\
        parameters
        
      \end{itemize}
      
    \item {\bf Prediction}
      
      \begin{itemize}
        
      \item evaluate $[T|Y]$, the conditional distribution of the
        target given the data
        
      \end{itemize}
      
    \end{itemize}


\end{frame} 

\begin{frame}    
\frametitle{A basic example: elevation data }
   
   \begin{columns}\column{0.5\textwidth}
   \includegraphics[width=0.8\textwidth,trim=2.5in 4in 2.5in 0.5in,clip=true]{Figures/elevation_krige.pdf}
   \column{0.5\textwidth}
      \includegraphics[width=0.8\textwidth,trim=2.5in 1in 3in 7in,clip=true]{Figures/elevation_krige.pdf}
      
 Raw data; kriging (with raw data overlaid); and kriging standard errors. 
 \end{columns}
%   \includegraphics[width=\textwidth]{figs/mj.vario.ps,height=7cm,width=0.9\paperwidth}}
 %  {\large empirical variograms for the original data (left) and 
  %   for \emph{residual} data (right). Solid line indicates a covariance model.}
   
%   
%   \mbox{
%     \centerline{
%       \psfig{figure=figs/mj.ok1.ps,width=0.4\paperwidth}
%       \psfig{figure=figs/mj.ok2.ps,width=0.4\paperwidth}
%     }
 %    \vspace{-0.5in}
%   }
%   {\large Kriging results: predicted values (left) and prediction std. error.}


    
    
  
  \end{frame} \begin{frame}
  \frametitle{PART II: BASIC GEOSTATISTICAL MODEL}
      
      
      

  

    \begin{enumerate}
      
      
    \item{\bf Notation} 
      
    \item{\bf The Signal Process}
      
    \item{\bf The Measurement Process}
      
    \item{\bf The Correlation Function}

    \item{\bf Model Extensions (1)}
      
      
    \end{enumerate}
    

  
  \end{frame} 
\begin{frame}
  \frametitle{Model-Based Geostatistics}
 
    
  \begin{block}{Basic model}

 response = mean effect + signal + noise 

 \end{block}
    

    \begin{block}{Notation}
        
    \begin{itemize}
      
      
    \item $\{\bmx_i : i=1,...,n\}$ is the {\bf sampling design}

      \item $\mu$ or $\mu_i:=\mu(\bmx_i)$ is the {\bf trend} or {\bf
          mean effect}
      
    \item $\{Y(\bmx) : \bmx \in A\}$ is the {\bf measurement process}

    \item $Y_i := Y(\bmx_i)$ a.k.a. the {\bf response}
      
    \item $\{S(\bmx) : \bmx \in A\}$ is the {\bf signal process}
      
    \item $T={\cal F}(S)$ is the {\bf target for prediction}
      
    \item $[S,Y]=[S][Y|S]$ is the {\bf geostatistical model}
      
    \end{itemize}
    
    

    Data consist of pairs  $(y_i,\bmx_i) : i=1,...,n$, possibly with
    covariates measured at each $\bmx_i$.
    \end{block}
    
    \end{frame} \begin{frame}
 \frametitle{The Signal Process}

    
    Model the signal process $S(\bmx)$ as a \textbf{Gaussian random field}
    (GRF), also known as a Gaussian process. Initially assume it is
    \textbf{stationary} and \textbf{isotropic}
      \begin{itemize}
        \item
      A \textbf{stationary} process is one whose probability
      distribution is invariant under translation.\\
      \item
      An \textbf{isotropic} process is one whose probability
      distribution is invariant under rotation.\\
      \end{itemize}
      
      \end{frame} 
      
 \begin{frame}

\frametitle{Stationary, Isotropic}
\begin{columns}
\column{0.5\textwidth}
\includegraphics{G-stnyiso}
\column{0.5\textwidth}

\includegraphics{G-stnyiso2}
\end{columns}
  
\end{frame}

\begin{frame}
\frametitle{Non-stationary}
\begin{columns}
\column{0.5\textwidth}
\includegraphics{G-nonStny}

  
\column{0.5\textwidth}
\includegraphics{G-nonStny2}
\end{columns}
  
\end{frame}


\begin{frame}
\frametitle{Stationary, Anisotropic}
\begin{columns}
\column{0.5\textwidth}

\includegraphics{G-aniso}
\column{0.5\textwidth}

\includegraphics{G-aniso2}
\end{columns}
  
\end{frame}
     

\begin{frame}
\frametitle{Isotropic, Non-stationary}
\includegraphics[width=0.6\textwidth]{G-isononst}
\end{frame}  
  
      
     
      \begin{frame}    
       \frametitle{Distribution}
       
      If $S(\bmx)$ is a stationary isotropic Gaussian process (SGP)
      then for any set of points $\bmx_1,\dots,\bmx_n \in A$
\[
\left[
\begin{array}{c}
S(\bmx_1)\\
.\\
S(\bmx_n)
\end{array}
\right]
~\sim~
N(m \bmone, \sigma^2 \bmR)
\]
where $\bmone$ is a vector of ones and 
\[
R_{ij}=\Corr{S(\bmx_i),S(\bmx_j)}
=\rho(\Norm{\bmx_i-\bmx_j})
\]
for some function $\rho(\cdot)$. Without loss of generality we will
always take $m=0$. 
Clearly at any one point $~\bmx$      
\[
S(\bmx)~\sim~N(0,\sigma^2)
\]

\end{frame} \begin{frame}
  \frametitle{The Measurement Process}

    
      
    \begin{enumerate}
      
    \item the
      conditional distribution of $Y(\bmx_i)$ given $S(\bmx_i)$ 
is $Y_i | s(\bmx_i)~\sim~N(\mu+s(\bmx_i),\tau^2)$; 
    \item $Y_i : i=1,...,n$ are
      mutually independent, conditional on $S(\cdot)$. 
    \end{enumerate}
    
    
    \includegraphics[width=0.7\textwidth,trim=1in 4.1in 1in 3.4in,clip=true]{Figures/simul1d.pdf}
    
Simulated data in 1-D illustrating the elements of the model: data $Y(\bmx_i)$ ($\bullet$), signal $S(\bmx)$ ($\frown$)  and mean $\mu$ (---).
    
    \end{frame} \begin{frame}

  \frametitle{An Equivalent Formulation:}
    
    
    
    $$Y_i =  \mu + S(\bmx_i) + \epsilon_i : i=1,...,n.$$
    
    
    
    where $S(\bmx)$ has mean $0$, and 
$\epsilon_i : i=1,...,n$ are mutually independent, identically
    distributed with $\epsilon_i \sim {\rm N}(0,\tau^2)$.
    
    
    
    The joint distribution of $\bmY$ is multivariate Normal,
    
\[
    \bmY 
:=
\left[
\begin{array}{c}
Y(\bmx_1)\\
.\\
Y(\bmx_n)
\end{array}
\right]
~\sim~ {\rm N}(\mu{\bf 1}, \sigma^2 \bmR +\tau^2 \bmI)
\]
    
    
    
    where:

    $\bmone$ is a vector of $1$'s
        
    $\bmI$ is the $n \times n$ identity matrix
    
    $\bmR$ is the $n \times n$
    matrix with $(i,j)^{th}$ element $\rho(u_{ij})$ where \\
    $u_{ij} := ||\bmx_i - \bmx_j||$, the Euclidean distance between $\bmx_i$ and
    $\bmx_j$.


\begin{center}\textit{Do exercise 1a}    \end{center}
    \end{frame} 
    
    \begin{frame}
 \frametitle{ The Correlation Function}
 
    
   \begin{block}{Positive definiteness}
    
    \begin{itemize}
      
      \item
The variance of some linear combination\\ 
$~~~a_1 S(\bmx_1)+ \dots +
a_nS(\bmx_n)$ is
\[
\sum_{i=1}^n {\sum_{j=1}^n{a_i a_j\Cov{S(\bmx_i),S(\bmx_j)}}}
=
\sigma^2 \sum_{i=1}^n {\sum_{j=1}^n{a_ia_j R_{ij}}}
\]
\item
This must be positive for all possible $a_i \in \Re$ (or possibly
zero).
\item
Not all candidate correlation functions possess this property.
      \end{itemize}
      \end{block}
    \end{frame}
    
    \begin{frame}
\frametitle{Positive Definite Matrices}
\begin{itemize}
	\item $A$ is positive definite if $x' A x > 0$ for all $x$.
	\item A necessary and sufficient condition for positive definiteness is for all the eigenvalues of $A$ to be positive.
	\item Variance matrices must be positive definite, since if $Y \sim N(0, \Sigma)$ then for a vector $a$, $\Var{a'Y} = a' \Sigma a$.
	\item For $a'Y$ to have positive variance for all $a$, then $\Sigma$ must be positive definite.
\end{itemize}
\end{frame}
    
    \begin{frame}
\frametitle{Positive Definite Functions}
\begin{itemize}
	\item $f(x)$ is a function of $x \in \Re^n$.
	\item for any set of points $x_1 \ldots x_m$
	\item create matrix $A_{ij} = f(x_i - x_j)$
	\item If $A$ is always positive definite, then $f$ is a positive definite function
	\item for $f$ to be a spatial variance function it must be positive definite
	\item Otherwise we could find a set of points $u_1 \ldots u_m$ and a vector $b$ such that
	\[
	\Var{b'\begin{pmatrix}Y(u_1) \\ \vdots \\ Y(u_m)\end{pmatrix}}
	\]
	is negative.
\end{itemize}
\end{frame}
    
    \begin{frame}
\frametitle{Characteristics of positive definite functions}
\begin{itemize}
 \item Non-negative, and monotone decreasing.
	\item {\bf Bochner's Theorem} states that all positive definite functions have non-negative Fourier transforms.
	\item The Exponential function and the Gaussian density are positive definite.
	\item The positive definite constraint leads us to use a small number of parametric families for covariance functions. 
\end{itemize}

\end{frame}
    
     \begin{frame}
    
\frametitle{ Differentiability of Gaussian processes}
    
    \begin{itemize}
      
            
    \item A formal mathematical description
      of the smoothness of a spatial surface $S(\bmx)$ is its
      degree of differentiability.
      
    \item $S(\bmx)$ is {\it mean-square continuous} if, for all $\bmx$,
      \[\Expect{\{S(\bmx+\bmh) - S(\bmx)\}^2} \rightarrow 0 
~\mbox{as}~ \Norm{\bmh}\rightarrow 0\]
      
    \item $S(\bmx)$ is {\it mean square differentiable} if there
      exists a process $S^\prime(\bmx)$ such that, for all $\bmx$,
      \[
      {\rm E}\left[ \left\{
          \frac{S(\bmx+\bmh)-S(\bmx)}{\Norm{\bmh}} - S^\prime(\bmx)\right\}^2\right]
      \rightarrow 0 \mbox{ as } \Norm{\bmh} \rightarrow 0
      \]
      
    \item the mean-square differentiability 
      of $S(\bmx)$ is directly
      linked to the differentiability of its covariance function $\rho(u)$.
      
    \end{itemize}
    \end{frame}
    
    \begin{frame}
    
    
 {\bf Theorem} Let $S(\bmx)$ be a SGP with 
    correlation function $\rho(u) : u \in \IR$. Then:
    \begin{itemize}
      
      
    \item $S(\bmx)$ is mean-square continuous iff $\rho(u)$ is 
      continuous at $u=0$;
    \item $S(\bmx)$ is $k$ times mean-square differentiable iff $\rho(u)$ is
      (at least) $2k$ times differentiable at $u=0$.
    \end{itemize}
    
    \end{frame} 
 
    
    
    \begin{frame}
    
    \frametitle{The Mat\'ern family}
      
      
      The correlation function is given by:
      
      $$\rho(u) = \{2^{\kappa-1} \Gamma(\kappa)\}^{-1} (u/\phi)^\kappa 
      K_\kappa(u/\phi)$$
      
    
      
      \begin{itemize}
        
        
      \item $\kappa$ and $\phi$ are parameters
      \item $K_\kappa(\cdot)$ denotes modified Bessel function of order $\kappa$
       \item valid for $\phi>0$ and $\kappa>0$.
        
   
        
      \item $\kappa=0.5$: {\it exponential} correlation function
      \item $\kappa \rightarrow \infty$: {\it Gaussian} correlation function
        
      \end{itemize}

        
      $S(\bmx)$ is mean-square $m$ times differentiable 
 %       \begin{flushright}
if and only if $\kappa>m~~~~$
%\end{flushright}
      

      \begin{columns}\column{0.5\textwidth}
      
      \includegraphics[width=\textwidth,trim=2.4in 4.8in 2.6in 4.3in,clip=true]{Figures/f3_5.pdf}
      \column{0.5\textwidth}
      Three examples of the Mat\'ern
        correlation function with $\phi=0.2$ and $\kappa=1$ (solid line),
        $\kappa=1.5$ (dashed line) and $\kappa=2$ (dotted line).
      \end{columns}
      \end{frame} 
      
\begin{frame}[fragile]
\frametitle{Simulating the Mat\'ern}
\begin{verbatim}
library(RandomFields)

x <- y <- seq(0, 20, by=1/4) 

f <- GaussRF(x=x, y=y, model="whittlematern", grid=TRUE,
             param=c(mean=0, variance=1, nugget=0, 
             				 scale=1, alpha=2))

image(x, y, f, col=topo.colors(100))
\end{verbatim}

The ``alpha'' parameter is the roughness parameter $\kappa$ in our notation.
\end{frame}
      
      
      
      \begin{frame}
      
\frametitle{The powered exponential family}
      
      
      $$\rho(u) = \exp\{-(u/\phi)^\kappa\}$$
      
    
      
      \begin{itemize}
        
        
      \item defined for $\phi > 0$ and $ 0 < \kappa \leq 2$
      \item $\phi$ and $\kappa$ are parameters
        

        
      \item mean-square continuous (but non-differentiable) \\if $\kappa<2$
      \item mean-square infinitely differentiable if $\kappa=2$
      \item $\rho(u)$ very ill-conditioned when $\kappa=2$
        
   
      \item $\kappa = 1$: {\it exponential} correlation function
      \item $\kappa = 2$: {\it Gaussian} correlation function 
      \end{itemize}
      
      
      {\bf Conclusion:} not as flexible as it looks
      
        \begin{columns}\column{0.5\textwidth}  
      
      \includegraphics[width=\textwidth,trim=2.4in 4.8in 2.6in 4.3in,clip=true]{Figures/f3_3.pdf}
      \column{0.5\textwidth}  
     Three examples of the powered exponential
        correlation function with $\phi=0.2$ and $\kappa=1$ (solid line),
        $\kappa=1.5$ (dashed line) and $\kappa=2$ (dotted line).
      \end{columns}
      \end{frame} \begin{frame}
      
\frametitle{The spherical family}
      
      
      $$\rho(u;\phi) = 
      \left\{ 
        \begin{array}{lcr}
          1 - \frac{3}{2} (u/\phi) + \frac{1}{2} (u/\phi)^3 & : & 0 \leq u \leq \phi \\
          0 & : & u > \phi
        \end{array}
      \right.
      $$
      
      

      \begin{itemize}
      

      \item $\phi >0$ is parameter
      \item finite range
        \item non-differentiable at the origin
        \item Has strange properties in the frequency domain which makes estimation unstable.
	\item Despite the problems it's very widely used.
      \end{itemize}
      
      
      \begin{columns}\column{0.6\textwidth}
      \includegraphics[width=\textwidth,trim=2.4in 4.8in 2.6in 4.3in,clip=true]{Figures/f3_1.pdf}
      
      \column{0.4\textwidth}
  The spherical correlation function with $\phi=0.6$. 
      \end{columns}
      

    
    \end{frame}
    

    

    
    
     \begin{frame}
    
\frametitle{Comparable Simulations (same seed)}
    \begin{tabular}{cc}
    \begin{minipage}{0.5\textwidth}
    \includegraphics[width=\textwidth,trim=1.7in 4.7in 1.8in 4in,clip=true]{Figures/f3_6.pdf}
    \end{minipage}
   &
    \begin{minipage}{0.5\textwidth}
      Mat\'ern\\
    $\phi=0.2$ and $\kappa=0.5$ (---),\\
      $\kappa=1$ ( - - - ) and $\kappa=2$ (\ldots).    \end{minipage}
    \\
    
     \begin{minipage}{0.5\textwidth}
    \includegraphics[width=\textwidth,trim=1.7in 4.7in 1.8in 4in,clip=true]{Figures/f3_4.pdf}
    \end{minipage}&
   \begin{minipage}{0.5\textwidth}
    Powered exponential\\
      $\phi=0.2$ and $\kappa=1$ (---),\\
      $\kappa=1.5$ ( - - - ) and $\kappa=2$ (\ldots).
    \end{minipage}
\\
     \begin{minipage}{0.5\textwidth}
    \includegraphics[width=\textwidth,trim=1.7in 4.7in 1.8in 4in,clip=true]{Figures/f3_2.pdf}
   \end{minipage}& \begin{minipage}{0.5\textwidth}
  Spherical\\ ($\phi=0.6$).
   \end{minipage} \end{tabular} 
    \end{frame} 
    
 
    
    \begin{frame}
  
  \frametitle{Model Extensions (1)}
    
    
\begin{block}{Removal of trends}

\begin{columns}\column{0.5\textwidth}
    Re-examine the elevation data;     
there is evidence for a linear (quadratic?) trend with co-ordinates.


   In general  replace constant $\mu$ with 
    \[
    \mu_i = \mu(\bmx_i) = \bmf(\bmx_i)' \bmbeta = 
\sum_{j=1}^k \beta_j f_j(\bmx_i)
    \]
    \column{0.5\textwidth}
    \includegraphics[width=\textwidth,trim=0.8in 2.5in 1in 1.7in,clip=true]{Figures/elevation_plot.pdf}
    \end{columns}
\end{block}
    So that
\[
Y_i =  \mu_i + S(\bmx_i) + \epsilon_i : i=1,...,n.
\]
where $\Expect{S(\bmx)}=0$; $S(\bmx)$ remains stationary and isotropic.

\end{frame}


 \begin{frame}
    
\frametitle{Covariates}
  
      \begin{itemize}
      

      \item 
$f_1(\bmx)=\bmone$ allows for an overall mean
      \item 
    To incorporate a linear trend in the elevation data, 
$f_2(\bmx)$ and $f_3(\bmx)$ would be 
    the $x$ and $y$ coords of $\bmx$.
        \item 
 In general 
    $f_j(\bmx_i)$ is any measured covariate at $\bmx_i$ (or function of it).\\
      \end{itemize}


NB although the linear trend is only obvious for the y-coord for the
elevation data, in
general we would fit similar trend effects to both coordinates so as
to be independent of the particular axis directions.\\

\textit{Q: how many more parameters would be required for a quadratic trend?}

  


  \end{frame} 
      
  \begin{frame}
    
  
  
  
  
  
 \frametitle{PART III:  Exploratory Variogram Analysis}
      
      
      
  

    \begin{enumerate}
      
      
    \item{\bf The Variogram} 
      
    \item{\bf The Empirical Variogram}

    \item{\bf Monte Carlo Variogram Envelope}
      
    \end{enumerate}
    


 
NB: Assumption that non-spatial exploratory analysis has already been performed.
\end{frame} \begin{frame}

\frametitle{The Variogram}

    
    
    
    
    
The variogram is defined by 

      $$V(\bmx,\bmx^\prime) := \frac{1}{2} \Var{Y(\bmx)-Y(\bmx^\prime)}$$


Let $S$ be an isotropic SGP with 
\[\Expect{S(\bmx)}=0~~~ ,~~~\Var{S(\bmx)}=\sigma^2\]
and correlation function $\rho(u)$.
Let the response be 
\[
Y(\bmx_i) = \mu_i + S(\bmx_i)+ \epsilon_i
\] 
where $\epsilon_i$ is i.i.d. Gaussian noise $\epsilon_i\sim N(0,\tau^2)$.\\


Then, writing $u = \Norm{\bmx-\bmx'}$, the variogram of $Y$ is
\[ 
V(u) = \tau^2 + \sigma^2(1-\rho(u))
\]


\textit{For proof see handout.}

\end{frame} 

\begin{frame}
\frametitle{Interpreting the Variogram}

\begin{tikzpicture}
\begin{scope}[line width=2pt,->]
\draw(0,0) -- (0,5) node[pos=0.5,sloped,above=7pt]{$V(u)$};
\draw(0,0) -- (6,0) node[pos=0.5,sloped,below=7pt]{$u$};
\end{scope}



\begin{scope}[dotted,line width=0.5pt]
\draw (0,1.5) -- (6,1.5) node[left,pos=0]{$\tau^2$};
\draw (0,4.5) -- (6,4.5) node[above,pos=0.5]{total sill} node[left,pos=0]{$\sigma^2+\tau^2$};
\end{scope}

\draw[line width=0.5pt,<->] (0.3,1.5) -- (0.3,4.5) node[left,pos=0.5,sloped,below]{sill};
\draw[line width=0.5pt,<->] (0.3,0.1) -- (0.3,1.5) node[left,pos=0.5,sloped,below]{nugget};


\draw[rounded corners=50pt,line width=1pt] (0,1.5) -- (3,4) -- (6,4.5);

\end{tikzpicture}

%\psfrag{sill}[][]{sill} 
%\psfrag{totalsill}[][]{total sill} 
%\psfrag{nugget}[][]{nugget} 
%\psfrag{range}[][]{range} 
%\psfrag{ts}[][]{$\tau^2$} 
%\psfrag{sspts}[][]{$\sigma^2+\tau^2$}
%\psfrag{u}[][]{$u$} 
%\psfrag{V}[][]{$V$} 
%
%    \includegraphics[width=\textwidth,trim=0.5in 0.5in 0.7in 5.7in, clip=true]{Figures/variog.pdf}
%
%    

\begin{itemize}
      

      \item {\it the nugget variance}: $\tau^2$
      \item {\it the sill}: $\sigma^2 = \Var{S(\bmx)}$
      \item {\it the total sill}: $\tau^2+\sigma^2 = \Var{Y(\bmx)}$
      \item {\it the range}: $\phi$, such that
        $\rho(u;\phi) =  \rho(u/\phi;1)$
      \end{itemize}
 
\end{frame} 

\begin{frame}
\frametitle{Variogram v Correlation}
\begin{itemize}
	\item Why not just use the correlation function?
	\item The Variogram is defined for non-stationary processes
	\item The Variogram is easier to estimate for irregular data
\end{itemize}
\end{frame}

\begin{frame}

 \frametitle{The Empirical Variogram}



    

    \begin{itemize}
      
      
    \item if $Y(\bmx)$ is stationary and isotropic,
      $$V(\bmx,\bmx^\prime)=V(u) = \frac{1}{2} 
\Expect{\{Y(\bmx)-Y(\bmx^\prime)\}^2}$$
      
    \item suggests an empirical estimate of $V(u)$:      
      $$\hat{V}(u) = \frac{1}{2}\,{\rm average}\{[y(\bmx_i) - y(\bmx_j)]^2\}$$
      
      where each average is taken over all pairs $[y(\bmx_i),y(\bmx_j)]$ such
      that $||\bmx_i-\bmx_j||  \approx u$
      
      
    \item for a process with non-constant mean (covariates), trends
      may be removed as follows:\\

      \begin{itemize}
        
        
      \item let $\hat{\bmbeta}$ be the OLS estimate     
      \item and $\hat{\mu}(\bmx_i)=\bmf(\bmx_i)'\hat{\bmbeta}$ 
      \item define $r_i := Y_i - \hat{\mu}(\bmx_i)$
      \item define $\hat{V}(u) = \frac{1}{2}\,{\rm average}\{(r_i - r_j)^2\}$,\\
        where each average is taken over all pairs $(r_i,r_j)$
        
      \end{itemize}
      
    \end{itemize}
    
    \end{frame} \begin{frame}
    
   \frametitle{The variogram cloud}
      
      
      \begin{itemize}
        
        
      \item define the quantities
        \begin{eqnarray*} 
          r_i & = &  y_i - \hat{\mu}(\bmx_i)\\
          u_{ij} & = & ||\bmx_i - \bmx_j||\\
          v_{ij} & =  & \frac{(r_i - r_j)^2}{2} \\
        \end{eqnarray*}
        
      \item the {\bf variogram cloud} is a scatterplot of the
        points $(u_{ij},v_{ij})$
        
      \end{itemize}
      
      
      
    \begin{block}{ Example: Swiss rainfall data}
      \begin{columns}\column{0.5\textwidth}

      
      \includegraphics[width=0.6\textwidth,trim=3.9in 0.8in 3in 0.7in,clip=true,angle=90]{sic100_cloud.pdf}
      \column{0.5\textwidth}
      
      
      \begin{itemize}
        
        
      \item under the spatial Gaussian model:
        \begin{itemize}
          
          
        \item $V_{ij} \sim V(u_{ij}) \chi_1^2$
        \item the $v_{ij}$ are correlated
        \end{itemize}
        
      \item the variogram cloud is therefore unstable, both
        pointwise and in its overall shape
      \end{itemize}
      \end{columns}\end{block}
      \end{frame} 
      \begin{frame}
      
 \frametitle{The empirical variogram}
      
      
      
      \begin{itemize}
        
        
      \item derived from the variogram cloud by {\bf averaging
          within bins}: $u-h/2 \leq u_{ij} < u+h/2$
        
        \item forms $k$ bins, each averaging over $n_k$ pairs

      \item removes the first objection to the variogram cloud,
        but not the second
        
      \item is sensitive to mis-specification of $\mu(\bmx)$
        
      \end{itemize}
      
      
      
     \begin{block}{Example: Swiss rainfall data.} 
      
      
      
      \includegraphics[width=0.35\textwidth,trim=3.9in 0.8in 3.1in 0.7in,clip=true,angle=90]{Figures/sic100_vario.pdf}
      
    Empirical binned variogram
    \end{block}
      

      \begin{center}{\it Do exercise 3a (lecturer), b/c (class)}\end{center}
      \end{frame} 
      

        
      
       \begin{frame}

\frametitle{Variograms of raw data and residuals can be very different}
    
    
\begin{block}{Example: Paran\'a rainfall data.} 
     empirical variograms
      of raw data (left-hand panel) and of residuals after linear
      regression on latitude, longitude and altitude (right-hand
      panel).
\begin{columns}\column{0.5\textwidth}
    \includegraphics[width=\textwidth,trim=1.3in 2.8in 1.2in 2.8in,clip=true]{Figures/parana3.pdf}
    
    \column{0.5\textwidth}
  
    
    
    
    
    \begin{itemize}
      

    \item variogram of raw data includes variation due to\\
      large-scale geographical
      trend
    \item variogram of residuals eliminates this source \\
      of variation
    \end{itemize}
\end{columns}
    \end{block}
    
    \end{frame}

     
     
     \begin{frame}
    
\frametitle{ How unstable are empirical variograms? }
    
    \begin{columns}\column{0.5\textwidth}
    
    \includegraphics[width=\textwidth,trim=1.3in 2.8in 1.2in 2.8in,clip=true]{Figures/parana2.pdf}
    
    \column{0.5\textwidth}
    
    \begin{itemize}
      

    \item thick solid line shows true underlying variogram
    \item fine lines show empirical variograms from three\\
      independent simulations of the same model
    \item high autocorrelations amongst $\hat{V}(u)$ for successive 
      values of $u$ imparts misleading smoothness 
    \end{itemize}
    
  \end{columns}

\end{frame} \begin{frame}

  \frametitle{Monte Carlo Variogram Envelope}


A simple test for spatial correlation.
\begin{itemize}
\item
$H_0$: there is no spatial correlation.
\item
Under $H_0$ the relative spatial positions of the data (or residuals)
are irrelevant
\item
Under $H_0$ the data may be permuted and the resulting empirical variogram
should arise from the same underlying distribution of variograms as
the original.
\end{itemize}

\end{frame}

\begin{frame}\frametitle{The Algorithm}

\begin{columns}\column{0.5\textwidth}
Repeat $k$ times
\begin{enumerate}
\item
randomly permute the data
\item
calculate the empirical variogram for this permutation
\end{enumerate}
For each $u$ use the lowest and highest (or 5\textsuperscript{th} and 95\textsuperscript{th}
percentiles) of the simulated $V(u)$'s as envelopes (under $H_0$) 
for the true value $V(u)$.

\column{0.5\textwidth}
\includegraphics[width=\textwidth,trim=2.6in 1.5in 2.4in 1in,clip=true,angle=90]{Figures/s300_variog_mc_env.pdf}

Variogram and envelope for simulated data with
  $\mu=0,\sigma^2=1,\tau^2=0.25$ and $\phi=0.3$.

\end{columns}

\end{frame}   

    \begin{frame}
  
    
  
\frametitle{PART IV:  PARAMETER ESTIMATION  FOR GAUSSIAN MODELS}
      
 
    \begin{enumerate}
      
      
    \item{\bf Maximum Likelihood Estimation }
      
    \item{\bf Model Extensions (2) - Box-Cox } 

      \item{\bf A Case Study: Swiss rainfall data }
      
    \item{\bf Model Extensions (3) - Anisotropy}
      
    \item{\bf Not Covered Here }
      
    \item{\bf Bayesian estimation of parameters}
      
    \end{enumerate}
    


  \end{frame} 
 
  \begin{frame}
  
 \frametitle{Maximum Likelihood Estimation}
   
 \begin{block}{The model}   
    
    \[
Y(\bmx_i) = \mu_i + S(\bmx_i)+ \epsilon_i
\] 

\begin{itemize}
      
\item
{\bf mean }
$\mu_i = \mu(\bmx_i) = \sum_{j=1}^k{\beta_j f_j(\bmx_i)}~$ 
i.e. $\bmmu = \bmF \bmbeta$ 
\item
{\bf SGP }
$S(\bmx)$ with $\Expect{S(\bmx)}=0,~\Var{S(\bmx)}=\sigma^2$ and 
$\Corr{S(\bmx_1),S(\bmx_2)} = \rho_k(\Norm{\bmx_1-\bmx_2};\phi)$
\item
{\bf nugget effect } Gaussian noise 
$\epsilon_i\sim~N(0,\tau^2)$
\end{itemize}
\end{block}
\begin{block}{Joint distribution}
\[
\bmY~\sim~N\left(\bmF \bmbeta, \sigma^2 \bmR + \tau^2\bmI\right)
\]
where $R_{ij}(\phi,\kappa) = \rho(\Norm{\bmx_i-\bmx_j};\phi,\kappa)$. 
\end{block}

\end{frame}
\begin{frame}


\begin{block}{Re-parametrise}
\begin{itemize}
	\item Signal to noise ratio: $\nu^2 := \frac{\tau^2}{\sigma^2}$
\item $\bmR_*(\nu,\phi,\kappa) = \bmR(\phi,\kappa)+ \nu^2 \bmI$
\item $\bmY~\sim~N\left(\bmF \bmbeta, \sigma^2 \bmR_*\right)$
\item In this parametrisation $\sigma$ will drop out of the likelihood.
\end{itemize}
\end{block}
\begin{block}{log-likelihood}
\begin{multline}
\ell(\bmbeta,\sigma^2,\nu,\phi,\kappa) = \\
-\frac{n}{2}\log{ 2 \pi}
-\frac{1}{2}\log \Det{\sigma^2 \bmR_*}
-\frac{1}{2\sigma^2}(\bmy - \bmF \bmbeta)'\bmR_*^{-1}(\bmy-\bmF \bmbeta)
\nonumber
\end{multline}
\end{block}
    \end{frame} 
    
    \begin{frame}
 \frametitle{Profile likelihood}
 
  
\begin{itemize}
\item There is no closed-form analytical solution for the parameters which maximise the likelihood
\item A numerical optimisation routine will have to be used
	\item If we know $\nu$, $\phi$, and $\kappa$ (the spatial correlation parameters) then there is an analytical solution for $\beta$ and $\sigma^2$ as the model is linear.
	\item {\bf The idea}:  use the numerical optimiser on $\nu$, $\phi$, and $\kappa$, using the analytic expressions for $\beta$ and $\sigma^2$
\end{itemize}
\[
\max_{\beta, \sigma^2, \nu, \phi,
  \kappa}{\ell(\bmbeta,\sigma^2,\nu,\phi,\kappa)}
=
\max_{\nu, \phi,\kappa}{
\left(\max_{\beta, \sigma^2}{\ell(\bmbeta,\sigma^2,\nu,\phi,\kappa)}\right)}
\]

By standard results of linear models (see `Supplementary material'), the log-likelihood
$~\ell(\bmbeta,\sigma^2|\nu^2,\phi,\kappa)~$ is maximised at
\begin{eqnarray*}
\hat{\bmbeta}(\nu,\phi,\kappa) &=& 
\left(\bmF'\bmR_*^{-1}\bmF\right)^{-1}\bmF'\bmR_*^{-1}\bmy\\
\hat{\sigma^2}(\nu,\phi,\kappa) &=&
\frac{1}{n}\left(\bmy - \bmF \hat{\bmbeta} \right)'\bmR_*^{-1}
\left(\bmy - \bmF \hat{\bmbeta} \right)
\end{eqnarray*}

\end{frame}\begin{frame}
\frametitle{Profile Likelihood (cont)}

Inserting the expressions for $\beta$ and $\sigma$ into the likelihood function gives the profile likelihood function
\begin{eqnarray*}
\ell^*(\nu,\phi,\kappa)
&=&
-\frac{n}{2}\log {2\pi} - \frac{1}{2}\log \Det{\hat{\sigma}^2 \bmR_*}-
\frac{n}{2}\\
-2 \ell^*(\nu,\phi,\kappa)+c&=&
n \log \hat{\sigma}^2 + \log \Det{\bmR_*}
\end{eqnarray*}
where $c$ is a constant.\\
\begin{itemize}
	\item Use a numerical optimiser (such as {\tt optim} in R) to find $\hat{\nu}, \hat{\phi}, \hat{\kappa}$
	
\item Back
substitution gives $\hat{\bmbeta}$ and $\hat{\sigma^2}$.\\

      
\end{itemize}
\end{frame} \begin{frame}

\begin{itemize}
      

      \item any reasonable version of the (linear) spatial Gaussian model
        has at least three parameters
        
\begin{itemize}
	\item A spatial variance parameter, for how close the process stays to the mean;
	\item Observation error variance, to take care of uncorrelated noise; and
	\item A range parameter so that changing between miles and km doesn't affect the model
\end{itemize}
 
      \item You need a lot of data (or contextual
        knowledge)
        to justify estimating more than 
        three parameters
      
    \item the {\bf Mat\'{e}rn} family adds a fourth, roughness, parameter. 
    
\begin{itemize}
	\item Stein's (1999) book shows that the roughness parameter has a lot of influence on the estimates of the other parameters
	\item Smooth surface $\Rightarrow$ low signal to noise ratio
  \item It is recommended to try a small number of discrete $\kappa$ 
       e.g. $\{0.5, 1, 2\}$
       \item The profile likelihood function for $\kappa$ is usually fairly flat, and more precise estimation is usually not warranted.
\end{itemize}

      \end{itemize}

\end{frame} \begin{frame}
\frametitle{Model Extensions (2) - Box-Cox}
 
 
    
    
\begin{itemize}
	\item The Gaussian model might be inappropriate for
    variables with asymmetric distributions.
    \item Log transforms often normalise positive-valued data with a heavy right tail
    \item Squaring data works with a heavy left tail.  
    \item   The Box-Cox transformation has a parameter $\lambda$ offering a continuous range of transformations. 
    \item Box-Cox is commonly used in linear regression.
    
   \item     Terminology: {\it Gaussian-transformed model}.  
\end{itemize}
\end{frame}

\begin{frame}{Box-Cox (continued)}
    
    
 The model is defined as follows: 
    \begin{itemize}
      

    \item assume a variable $\bmY^* \sim MVN(\bmF \bmbeta,
      \sigma^2 \bmR_*)$
      
    \item the data, denoted $\bmy=(y_1,...,y_n)$, are generated
      by a transformation of the linear Gaussian model 
      \begin{eqnarray*}
        y^*_i = h_{\lambda}(y_i) =\left\{
          \begin{array}{ll}
            \frac{(y_i)^{\lambda} - 1}{\lambda} & \mbox{if $\lambda \neq 0$}\\
            \log(y_i) & \mbox{{if $\lambda = 0$}}
          \end{array}              
        \right.
        \label{boxcox}
      \end{eqnarray*}
    \end{itemize}
    
  
    
    The log-likelihood is:
    

      \begin{eqnarray*}
        \nonumber \ell(\bmbeta,\sigma^2,\nu,\phi,\kappa, \lambda) = c & - &
\frac{n}{2} \log \sigma^2 -\frac{1}{2} \log |R_*| \\
          &-& 
%\hspace{0.5cm} -%
          \frac{1}{2}(h_{\lambda}(\bmy)-\bmF\bmbeta)^\prime \{\sigma^2 \bmR_*\}^{-1} 
(h_{\lambda}(\bmy)-\bmF\bmbeta) \\
        & + & (\lambda-1)\sum_{i=1}^{n}
        \log y_i
      \end{eqnarray*}

Here $h_\lambda(\bmy) := (h_\lambda(y_1),\dots,h_\lambda(y_n))'$.    
    \end{frame}
    
     \begin{frame}
    
\frametitle{Notes:}
    

    \begin{itemize}
      \item
        Requires all $y_i>0$. 
        \begin{itemize}
          \item
          if some $y_i=0$ simply through rounding then replace with 
          `imputed'  low values.
          \item
            if some $y_i=0$ because there is a probability mass at $0$
            then the model is strictly inappropriate.
          \end{itemize}

        \item
          Allowing any $\lambda \in \Re$ and simply maximising the
          log-likelihood can lead to difficulties in scientific
          interpretation.
          \begin{itemize}
            \item
              Allow only a small set of interpretable
              values e.g. $\{-1,0,0.5,1\}$.
              \item
                Examine the profile log-likelihood for $\lambda$ to
                choose the most appropriate value.
              \item
                Transform the data then analyse as standard case.
          \end{itemize}
          \end{itemize}
          
          \end{frame}
          \begin{frame}
          \begin{itemize}
          \item
            Optimisation is CPU intensive for large datasets
            \begin{itemize}
              \item
                Most of the information for $\lambda$ is in the
                marginal likelihood (i.e. ignoring spatial variation)
              \item
                Obtain a point estimate by maximising
                \begin{multline}
\ell^*(\bmbeta,\sigma^2,\lambda) = 
c - 
\frac{n}{2} \log \sigma^2\\ - 
\frac{1}{2\sigma^2}
          (h_{\lambda}(\bmy)-\bmF\bmbeta)^\prime (h_{\lambda}(\bmy)-\bmF\bmbeta) 
         +  (\lambda-1)\sum_{i=1}^{n}
        \log y_i \nonumber
                \end{multline}
            \end{itemize}
    \end{itemize}
    
    \end{frame} \begin{frame}
\frametitle{ A Case Study: Swiss rainfall data}

\begin{columns}\column{0.5\textwidth}      
    
    \includegraphics[width=\textwidth,trim=2in 3in 2in 4.5in,clip=true]{Figures/avig1.pdf}
    
    Locations of the data points with points size proportional to
      the value of the observed data. Distances are in kilometres.   
\column{0.5\textwidth}
    
    \begin{itemize}
            

    \item  $467$ locations in Switzerland 
      
    \item daily rainfall measurements on 8th of May 1986
      
    \item The data values are integers
      where the unit of measurement is $1/10$ mm
      
    \item $5$ locations
      where the value is equal to zero.
      
    \end{itemize}
    \end{columns}
    \end{frame} \begin{frame}
    
  \frametitle{Swiss rainfall data (cont.)}
  
\begin{columns}\column{0.5\textwidth}   
  
 	MLE of Box-Cox parameter $\lambda$ for different values of the Mat\'ern roughness parameter
 $\kappa$.
 
      \begin{center}
        \begin{tabular}{|r|rr|} \hline
          $\kappa$  &  $\hat{\lambda}(\kappa)$ & $\log\hat{L}$  \\ \hline
          $0.5$ & 0.514 & -2464.246 \\  
          $1$ & 0.508 & -2462.413 \\ 
          $2$ & 0.508 & -2464.160 \\ \hline
        \end{tabular}
      \end{center}



    
    

\column{0.5\textwidth} 
    
     Profile likelihoods for $\lambda$ (--$\cdot$--), with 90\% and 95\% confidence limits (- - -) 
     
    \includegraphics[width=\textwidth,trim=1.3in 4in 1.3in 4.3in,clip=true]{Figures/avig2.pdf}

  $\kappa=0.5$, \hspace{0.3in} $\kappa=1$, \hspace{0.3in} $\kappa=2$. 
     \end{columns} 
    
\begin{itemize}
	\item In all cases $\lambda=0.5$ is within the interval but untransformed and log-transformed cases are not.
\end{itemize}

    
    \end{frame}
    
     \begin{frame}
    
 
\begin{block}{ Parameter estimates with $\lambda=0.5$}

        \begin{tabular}{|r|rrrrr|} \hline
          $ \kappa$ & $\hat{\beta}$ & $\hat{\sigma}^2$ & $\hat{\phi}$ & $\hat{\tau}^2$ & $\log\hat{L}$ \\ \hline
          $0.5$ & 18.36 & 118.82 & 87.97 & 2.48 & -2464.315 \\  
          $1$ & 20.13 & 105.06 & 35.79  & 6.92 & -2462.438 \\ 
          $2$ & 21.36 & 88.58 & 17.73 & 8.72 & -2464.185 \\ \hline 
        \end{tabular}
   \end{block}
   
   \begin{block}{Profile likelihoods with $\kappa=1$ and $\lambda=0.5$}
   
      \includegraphics[width=\textwidth,trim=1.2in 4in 1.3in 4.3in,clip=true]{Figures/avig3.pdf}
      

    \hspace{0.6in}    $\sigma^2$ \hspace{1.3in}$\phi$  \hspace{1.3in} $\tau^2$ 
\end{block}
 
    \end{frame}
    
\begin{frame}

\frametitle{ The log-transformation ($\lambda=0$)}

    
\begin{itemize}
	\item Log Gaussian data tend to have sharp peaks and large shallow troughs.
   \item On the log scale, $Y^*(\bmx) = \log[Y(\bmx)]$.
\[
Y^*(\bmx) = \mu + S(\bmx) + \epsilon = \mu + \sigma Z(\bmx) + \epsilon(\bmx) 
\]
\item On the natural scale, 
 \[
    Y(\bmx) = e^{Y^*(\bmx)} = e^{\mu}  (e^{Z(\bmx)})^{\sigma} e^{\epsilon(\bmx)}
\]
The larger $\sigma^2$ the sharper the peaks and softer the
troughs.
\item Remember $\E(Y(x)) = \exp[\E(Y^*(x)) + \Var{Y^*(x)}/2]$.   
\end{itemize}

\end{frame}
\begin{frame}
\frametitle{Simulations of log-Gaussian processes}

\hspace*{1in} $\sigma^2=1$ \hspace{1.7in} $\sigma^2=2$. 

\includegraphics[width=\textwidth,trim=0.8in 4.2in 1.4in 4.0in,clip=true]{Figures/log_gauss.pdf}
      

  
\end{frame} \begin{frame}
\frametitle{ Model Extensions (3) - Anisotropy}

    
    
      
      \begin{itemize}
        
        
      \item Environmental conditions can induce directional effects (wind, soil formation, etc).
        
      \item As a consequence the spatial correlation may vary with the direction.
        
      \item a possible approach: {\it geometric anisotropy}.
      
      \hspace*{-0.5in}\includegraphics[width=\textwidth,trim=1.7in 5in 1.9in 4.4in,clip=true]{f3_7.pdf}

 correlation contours for an isotropic model (left) and two anisotropic models (center and right).
        
      \end{itemize}


      \end{frame} \begin{frame}
      
\frametitle{Geometric Anisotropy}
      
     
     \begin{columns}
     \column{0.4\textwidth} 


{
\psfrag{x1}[][]{$x_1$} 
\psfrag{x2}[][]{$x_2$}
\psfrag{a}[][]{$\psi_A$}
\psfrag{r}[][]{$u$}
\psfrag{rpsr}[][]{$u \psi_R$}
        \includegraphics[trim=0.5in 0in 0.5in 0in,clip=true,width=1.2\textwidth]{anisocor.pdf}
}

\column{0.6\textwidth} 


      \begin{itemize}
            
      \item two more parameters: the {\it anisotropy angle} $\psi_A$ and the
        {\it anisotropy ratio} $\psi_R \ge 1$.
        
      \item transform the true co-ordinates $(x_1,x_2)$ to new
        co-ordinates $(x_1^\prime,x_2^\prime)$ by rotation and
        squashing.
        \end{itemize}
        
                
{\scriptsize\[
      \left[
          \begin{array}{r}
            x_1^\prime \\
            x_2^\prime 
          \end{array}
        \right]  
=
      \left[
          \begin{array}{rr}
            1 & 0 \\
            0 & \frac{1}{\psi_R}
          \end{array}
        \right]  
        \left[
          \begin{array}{rr}
            \cos(\psi_A) & -\sin(\psi_A) \\
            \sin(\psi_A) & \cos(\psi_A)
          \end{array}
        \right]
      \left[
          \begin{array}{r}
            x_1 \\
            x_2 
          \end{array}
        \right]
\] }
\begin{itemize}
\item
Analyse the data with respect to the new co-ordinate system.
\end{itemize}
\end{columns}
\end{frame} 
 

\begin{frame}      
\frametitle{Parameter Estimation}

\begin{block}{Likelihood based methods}
    
    \begin{itemize}
            
    \item Add two parameters (angle and squashing) 
    \item Increases the dimension of the numerical minimisation problem
    \item In practice a lot of data might be needed
    \end{itemize}

\end{block}

\begin{columns}

\column{0.5\textwidth}    
    
  \begin{block}{Variogram based exploration}

    \begin{itemize}
            
    \item Compute variograms for different directions
    
    \item Angle bins, in particular for irregularly distributed data


\item Directional variograms for the Swiss rainfall data.  $\Rightarrow$  
    \end{itemize}
     \end{block}   
    
\column{0.5\textwidth}
    
    \includegraphics[width=\textwidth,trim=1.3in 3.6in 1.3in 2.6in,clip=true]{sicvario4.pdf}


\end{columns}    

    
    \end{frame} \begin{frame}
\frametitle{Not covered here}
      
     \begin{block}{Restricted maximum likelihood (REML)}
          \begin{itemize}
            \item transform the data $\bmY \rightarrow \bmY^*$ so that
              $\bmY^*$ does not depend on $\beta$
            \item estimate $(\sigma^2, \nu, \phi, \kappa)$ by maximum
              likelihood on the transformed data $\bmY^*$
            \item leads to less biased estimates in small samples
            \item MLE's are 
              sensitive to mis-specification of $\bmF$ (the 
              covariate mode for $\bmmu$)
            \end{itemize}
\end{block}
\end{frame}

\begin{frame}\frametitle{Also not covered}

\begin{block}{Non-stationary random variation?}
        
        \begin{itemize}

          \item
        {\bf Intrinsic} variation a weaker hypothesis than stationarity
        (process has stationary increments, cf random walk model in time series),
        widely used as default model for discrete spatial variation (Besag, York
        and Molli\'{e}, 1991).
        

        \item
        {\bf Spatial deformation} methods (Sampson and Guttorp, 1992) seek to
        achieve stationarity by
        transformation of the geographical space, $x$.
        
        
      \item as always, need to balance increased flexibility of
        general modelling
        assumptions against over-modelling of sparse data, leading to poor
        identifiability of model parameters.
        \end{itemize}
\end{block}
    
%    The REML principle is as follows:
    
%    \begin{itemize}
%      

%    \item under the assumed model for ${\rm E}[Y] = F \beta$, transform the
%      data linearly to $Y^* = A Y$ such that the distribution of $Y^*$ does
%      not depend on $\beta$;
%    \item estimate $\theta = $ by
%      maximum likelihood applied to the transformed data $Y^*$
%    \end{itemize}
    
%    We can always find a suitable matrix $A$ without knowing the
%    true values of $\beta$ or $\theta$, for example
%    $$A = I - F (F^\prime F)^{-1}F^\prime$$
    
%    The REML estimators for $\theta$
%    can be computed by maximising
%    \begin{eqnarray*}
%      L^*(\theta) & \propto & -\frac{1}{2}\left\{ \log |\sigma^2 V| 
%        +\log |F^\prime \{\sigma^2 V\}^{-1} F| \right.\\
%      & &
%      \left.+(y-F\tilde{\beta})^\prime \{\sigma^2 V\}^{-1} (y-F\tilde{\beta)}) \right\},
%    \end{eqnarray*}
    
%    where $\tilde{\beta} = \hat{\beta}(\theta)$.
    
%    \end{frame} \begin{frame}
    
%    {\bf Comments on REML}
%    
    
%    \begin{itemize}
%            

%    \item introduced in the context of variance components estimation in
%      designed experiments (Patterson and Thompson, 1971)
      
%    \item leads to less biased estimators in small samples
      
      
%    \item  $L^*(\theta)$ depends on $F$, 
%      and therefore on a correct specification of the model
%      for $\mu(x)$, 
      
%    \item $L^*(\theta)$ does not depend on the choice of $A$. 
      
%    \item Given the model for $\mu(\cdot)$, the method
%      enjoys the same objectivity as does maximum likelihood
%      estimation.
      
%    \item  widely recommended for geostatistical models. 
      
%    \item REML is more sensitive than ML
%      to mis-specification of the model for $\mu(x)$. 
      
%    \item for designed experiments the mean $\mu(x)$ is usually well specified
      
%    \item however in the spatial setting there is no  sharp distinction between 
%      $\mu(x)$ and $S_c(x)$. 
      
      
%    \end{itemize}
    


\end{frame} 

\begin{frame}
\frametitle{Bayesian Estimation Of Parameters}
 
As before:

\begin{description}
\item[the model:]
$S$ is an SGP with
$\Expect{S(\bmx)}=0,~\Var{S(\bmx)}=\sigma^2$, and 
correlation function $\rho(u;\phi)$.
Response is 
\[Y(\bmx_i) = \mu + S(\bmx_i)+ \epsilon_i\]
with i.i.d. Gaussian noise $\epsilon_i\sim N(0,\tau^2)$ (``nugget''). 
\item[reparameterisation:]
\[
\nu^2 :=
\frac{\tau^2}{\sigma^2}
\]
\item[correlation matrix]:
 has elements 
$R_{ij}(\phi) := \rho(\Norm{\bmx_i-\bmx_j};\phi)$. Define 
\[
\bmR_*(\phi,\nu) := \bmR(\phi) + \nu^2\bmI
\]
\end{description}
\end{frame}

\begin{frame}
\frametitle{Bayesian stuff}

A judicious choice of priors yields a convenient posterior

\begin{description}
\item[priors for $\phi$ and $\nu$]
Choose a \textit{discrete} prior for $\phi$ and $\nu$
\[
\pi_{\phi,\nu}(\phi,\nu)
\]
\item[prior for $\sigma^2$ and $\mu$]
Choose continuous priors 
\begin{eqnarray*}
\left(\frac{\sigma^2}{n_\sigma S^2_\sigma} \right)^{-1}| \phi, \nu ~~~&\sim&
\chi^2_{n_\sigma}\\
\mu | \sigma, \phi, \nu &\sim& 
\Gauss{m_\mu, \sigma^2 V_\mu}
\end{eqnarray*}
This is known as a Gaussian-scaled-Inverse-$\chi^2$ distribution on
$(\mu,\sigma^2)$.
\end{description}    

\end{frame} \begin{frame}

\begin{description}

\item[posterior for $(\phi,\nu)$]: a \textit{discrete} posterior with
\begin{equation}
p(\phi,\nu|\bmy)
\propto 
\pi_{\phi,\nu}(\phi,\nu)~\Det{\bmR_*}^{-1/2} V_*^{1/2} \left(S_*^2\right)^{-\frac{1}{2}(n_\sigma+n)}
\label{eqn.norm.inv.chi.discr.post}
\end{equation}

\item[posterior for $(\mu,\sigma^2)$]: 
A Gaussian-scaled-Inverse-$\chi^2$ posterior distribution for 
$~\mu,\sigma^2|\phi,\nu~$ 
\begin{eqnarray}
\left(\frac{\sigma^2}{(n_\sigma+n)S^2_*} \right)^{-1}| \phi, \nu, \bmy &\sim&
\chi^2_{n_\sigma+n}\label{eqn.norm.inv.chi.cts.post.gam}\\
\mu | \sigma^2, \phi, \nu, \bmy &\sim&
\Gauss{m_*,\sigma^2 V_*}
\label{eqn.norm.inv.chi.cts.post.mu}
\end{eqnarray}
\end{description}
where
\begin{eqnarray*}
V_* &:=& \left(V_\mu^{-1}+\bmone'\bmR_*^{-1}\bmone\right)^{-1}\\
m_* &:=& V_*\left(m_\mu V_\mu^{-1}+\bmone'\bmR_*^{-1}\bmy\right)
\end{eqnarray*}
and
\[
S_*^2 := \frac{1}{n_\sigma+n}\left(
m_\mu^2 V_\mu^{-1} + n_\sigma S^2_\sigma +\bmy'\bmR_*^{-1}\bmy
-m_*^2 V_*^{-1}
\right)
\]

\begin{center}
{\it see handout for proof (the proof is \textbf{not examinable}.)}
\end{center}

\end{frame} 

\begin{frame}
\frametitle{Monte Carlo Algorithm}
\begin{itemize}
	\item Sum the right hand side of (\ref{eqn.norm.inv.chi.discr.post}) over 
the finite number of possible   
values of $(\phi,\nu)$ and divide by this to obtain the posterior 
$p(\phi,\nu | \bmy)$ for
each combination of $(\phi,\nu)$.

\item Simulate directly from the full posterior by the following Monte Carlo algorithm

\begin{itemize}
\item  
simulate $\phi^{(i)}$ and $\nu^{(i)}$ from $p(\phi,\nu |\bmy)$.
\item  
simulate from the posterior for $\sigma^2 |\phi^{(i)}, \nu^{(i)}$ 
 using (\ref{eqn.norm.inv.chi.cts.post.gam}).
\item 
simulate from the posterior for $\mu | \sigma^{2~(i)}, \phi^{(i)}, \nu^{(i)}$ using (\ref{eqn.norm.inv.chi.cts.post.mu}).
\item 
$i \leftarrow i+1$; repeat.
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
    \frametitle{A Case Study: Swiss rainfall, 100 data}

    
%    {\bf Profiles and posteriors}
    
Profile Likelihoods    
    
    \includegraphics[width=1.4in,angle=90,clip=true,trim=4.4in 1.7in 3.6in 1.4in ]{Figures/sic100_prof.pdf}
    
    
  Posterior distributions  
    
    \includegraphics[width=1.4in,angle=90,clip=true,trim=4.4in 1.7in 3.6in 1.4in]{Figures/sic100_postunif.pdf}
    
    
\end{frame}

\begin{frame}
\frametitle{Joint posterior}
 
    \includegraphics[width=3.1in,angle=90]{Figures/sic100_2Dunif.pdf}
%      \mbox{\centerline{
%          {\psfig{figure=Figures/f6.5a.ps,width=8cm,height=5cm}}
%          {\psfig{figure=Figures/f6.5b.ps,width=8cm,height=5cm}}
%          }}

Samples and contours
   
    
    \end{frame}


\begin{frame}
\frametitle{Extensions}
\begin{description}
\item[Add covariates]: with $~\bmmu =\bmF \bmbeta$, {\em but how does this affect the posterior?}

\item[Vary all parameters]: $\kappa, \lambda, \psi_R, \psi_A$ are fixed.  In principle
  these could be included in the analysis, with discrete priors.

\item[improper priors]: certain simple improper conjugate priors for
  $\mu$ ({\it flat}) and $\sigma^2$ ({\it reciprocal}) are often chosen
  and still lead to proper posteriors (subject to reparameterisation
 to $\nu$ not $\tau$)
\begin{itemize}
\item 
$~~\pi_\mu(\mu) \propto 1$ (`$V_\mu \rightarrow \infty$')
\item
$~~\pi_\sigma(\sigma^2) \propto 1/\sigma^2$ (`$n_\sigma
\rightarrow 0$'). This is the commonly used Jeffreys prior.
\item but see note in section on GLSM's. 
\end{itemize}
\end{description}

%    \end{frame} \begin{frame}
%    
%    {\bf Example: Swiss rainfall data}
    
%    
    
%     \includegraphics[width=\textwidth]{Figures/sic100.variofit.ps,height=8.5cm,width=13cm}}

%    {\Large Fitted variograms (100 data), using three different methods of estimation: (a)
%      curve-fitting (thin line); (b) ML (dashed line); (c) posterior mode (thick line).}
    
    \end{frame}
    
\begin{frame}
\frametitle{ML v Bayes}
\begin{block}{Bayes}
\begin{itemize}
	\item Allows for parameter uncertainty to carry over to predictions
	\item Less damage caused by inclusion of poorly identified parameters
	\item More exact parameter confidence intervals (ML is asymptotic)
	\item Can incorporate prior information 
	\item Bayes is necessary for non-Gaussian responses (more on that later).
	\end{itemize}
	\end{block}
	
	\begin{block}{ML}
	
\begin{itemize}
	\item Not affected by priors
	\item Computationally simpler
\end{itemize}
	\end{block}

\end{frame}


     \begin{frame}
  
    
  
  
  
 \frametitle{ PART V: SPATIAL PREDICTION FOR GAUSSIAN MODELS}
  

    \begin{enumerate}
      
      
    \item{\bf Stochastic Process Prediction }
      
    \item{\bf Prediction under the Gaussian Model}
      
    \item{\bf What does Kriging Actually do to the Data }
      
    \item{\bf Prediction of linear Functionals}
  
  \item{\bf Plug-in Prediction}

  \item{\bf Model Validation and Comparison}
    
    \end{enumerate}
    

  

  
  \end{frame} \begin{frame}
  \frametitle{Stochastic Process Prediction}
  
    
    
    
 \begin{block}{General results for prediction}
    
    
    
    \begin{description}
      
      
    \item[ goal:] predict the realised value of a (scalar) r.v. $T$,
      using  data $\bmy$ a realisation of a (vector) r.v. $\bmY$.
      
    \item[a  predictor] of $T$ is any function of $\bmY$, $\hat{T} = t(\bmY)$
            
    \item[the mean square error (MSE)] of $\hat{T}$ is
      $$ MSE(\hat{T}) = \Expect{(T-\hat{T}(\bmY))^2}$$
      \textit{(expectation over both $T$ and $\bmY$)}

      \item[the  MMSE predictor] minimises the MSE 

    \end{description}
    \end{block}
\end{frame}
\begin{frame}

 \frametitle{Theorem}  
    

    The minimum mean square error predictor of $T$ is
    \[\hat{T} = \Expect{T|\bmY}\]
    at which point 
\[
   \Expect{(T-\hat{T})^2}  =  \Expects{\bmY}{\Var{T|\bmY}} 
\]
    (the prediction variance is an estimate of the MSE) $\Box$ 

\textit{See handout for proof.}
    
    
    
   Also, directly from the second tower property
\[
\Var{T} = \Expects{\bmY}{\Var{T|\bmY}} + \Vars{\bmY}{\Expect{T|\bmY}}
\]    
    Hence $\Expect{(T-\hat{T})^2} \leq \Var{T}$, with equality
    if $T$ and $\bmY$ are independent random variables.
    
    \end{frame} \begin{frame}
    
\frametitle{Comments}
    
    
    
    \begin{itemize}
      
      
    \item We call $\hat{T}$ the {\it least squares predictor} for $T$,
      and $\Var{T|\bmY}$ its {\it prediction variance}
      
    \item $\Var{T} - \Var{T|\bmY}$ measures the contribution
      of the data (exploiting dependence between $T$ and $\bmY$)
      
    \item point prediction and prediction variance are summaries
      
    \item complete answer is the distribution $[T|\bmY]$ 
      
    \item not transformation invariant:  $\hat{T}$ the best predictor
      for $T$ does NOT necessarily imply that $g(\hat{T})$ is the best
      predictor for $g(T)$.
      
    \end{itemize}
    
    
    \end{frame} \begin{frame}
    
  \frametitle{ Prediction Under The Gaussian Model}

    
    
    \begin{itemize}
      
      
    \item  \textbf{assume all the parameters}
      $\bmbeta,\sigma^2,\tau^2,\phi,\kappa$ \textbf{are known}

    \item  assume that the target for prediction is $T= S(\bmx')$      
          
    \item $\hat{T} = \Expect{T|\bmY}$,  $\Var{T|\bmY}$ and $[T|\bmY]$
      can be easily derived from a standard result.
      
    \end{itemize}
    
    
    
            
Under the Gaussian model $Y(\bmx_i)= \mu_i+S(\bmx_i)+\epsilon_i$
\[
    \left [
      \begin{array}{l}
        T \\
        \bmY
      \end{array}
    \right]
\sim
N\left(
\left [
      \begin{array}{l}
        0 \\
        \bmmu
      \end{array}
    \right]
,
\sigma^2    \left [
      \begin{array}{ll}
        1 & \bmr^\prime \\
        \bmr  & \bmR + \nu^2 \bmI
      \end{array}
    \right]
\right)
\]
$\bmmu=\bmF \bmbeta$  and $\bmr$ is a vector with elements  
$r_i = \rho_{\kappa}(||\bmx' - \bmx_i||;\phi) : i=1,\ldots,n$.
Again define $\bmR_* = \bmR + \nu^2 \bmI$.    
    
    \end{frame}
    
    \begin{frame}
    
    \frametitle{Conditional Distribution}
    
    Using \textit{background results on partitioning the MVN} with
    $\bmZ_1 = T$ and $\bmZ_2 = \bmY$,
    we find that
    the minimum mean square error predictor for $T=S(\bmx')$ is
    
    \begin{eqnarray}
      \hat{T} &=&  \sigma^2 \bmr^\prime ~(\sigma^2 \bmR_*)^{-1}
      (\bmy - \bmmu)\nonumber\\
      &=&  \bmr^\prime ~(\bmR_*)^{-1}
      (\bmy - \bmmu)
      \label{that}
    \end{eqnarray}
    
    
    with prediction variance
    
    \begin{eqnarray}
      \Var{T|\bmY} &=& 
\sigma^2 - \sigma^2 \bmr^\prime~(\sigma^2 \bmR_*)^{-1} \sigma^2 \bmr
\nonumber\\
&=&
\sigma^2 \left(1- \bmr^\prime~(\bmR_*)^{-1}  \bmr \right)
      \label{vart}
    \end{eqnarray}
    
    \end{frame} 
    
    
    \begin{frame}
 \frametitle{Example: Swiss rainfall data}
   
    
\begin{columns}
\column{0.5\textwidth}
      
      

     
      
      \includegraphics[width=\textwidth,trim=2in 3in 2in 4.5in,clip=true]{Figures/avig1.pdf}
  


    \column{0.5\textwidth}
      
      \begin{itemize}
  \item 
      Locations shown as points with size proportional to
        the value of the observed rainfall.               
      \item  $467$ locations in Switzerland 
    
      \item daily rainfall measurements on 8th of May 1986
     
     \item $\hat\kappa =1$, $\hat\mu= 20.13$, $\hat\sigma^2 = 105.06$, $\hat\phi = 35.79$, $\hat\tau^2= 6.92   $
  
      \end{itemize}
      
      \end{columns}

      \end{frame}
    
    \begin{frame}

    
    
    
\frametitle{Swiss rainfall data (cont.)}
    
   
\begin{columns}\column{0.5\textwidth}  
    
 \begin{block}{Predictions $\E(S(x) | Y_1 \ldots Y_n)$}
    \includegraphics[width=\textwidth,trim=1.3in 4in 4.2in 4.3in,clip=true]{Figures/avig6.pdf}
    
    \end{block}
    \column{0.5\textwidth}
    
 \begin{block}{Variances $\Var{S(x) | Y_1 \ldots Y_n} $   
}
    \includegraphics[width=\textwidth,trim=4.2in 4in 1.3in 4.3in,clip=true]{Figures/avig6.pdf}
    
    \end{block}
    \end{columns}
\end{frame}

    \begin{frame}
\frametitle{Notes}
  \begin{enumerate}
\item  Applies whether $\bmx'$ is a new point or a data point.
    

  \item  Because the conditional variance does not depend on $\bmY$, the
    prediction mean square error is equal to the prediction variance. 
    
    
\item  Equality of prediction mean square error and prediction
    variance (for any $\bmy$) 
    is a special property of the multivariate Gaussian distribution,
    not a general result.
    
    
\item  In conventional geostatistical terminology, construction of
    the surface $\hat{T}=\mu(\bmx) + \hat{S}(\bmx)$ 
    using (\ref{that}) is called 
    {\it kriging}. This name is a reference to D.G. Krige,
    who pioneered the use of statistical methods in the South African
    mining industry (Krige, 1951).
        

  \item  Easy to extend to finding the expectation and joint covariance
    matrix $\bmSigma$ of the signal at a set of points:
    $\bmS_G := [S(\bmx_1'),\dots,S(\bmx_g')]'$ given the data (this is
    a complete specification of the distribution since $\bmS_G \sim MVN$).
\end{enumerate}
    \end{frame} \begin{frame}
   \frametitle{ What Does Kriging Actually Do?}
 
    
The minimum mean square
    error predictor for the mean + signal $~\mu(\bmx') + S(\bmx')~$ is given by
    
    \begin{eqnarray*}
      \hat{T}(\bmx') & = & \mu(\bmx') + \sum_{i=1}^n w_i(\bmx')
      (Y_i-\mu(\bmx_i)) 
    \end{eqnarray*}
    
    
    
    \begin{itemize}
      
      
    \item  the predictor $\hat{T}(\bmx')$ is a compromise
      between the unconditional mean $\mu(\bmx')$ and the deviations
      of the observed
      data $\bmY(\bmx_i)$ from their means $\mu(\bmx_i)$ 
      
    \item  the nature of the compromise depends
      on the target
      location $\bmx'$, the data-locations $\bmx_i$ and the values
      of the model parameters. 
      
    \item the $w_i(\bmx')$ are called the {\it prediction weights}.
      
    \end{itemize}
    
    \end{frame} 
    

    
    \begin{frame}

\frametitle{Effects on predictions}
    
   \begin{block}{Varying the correlation function}
      
      

        \includegraphics[width=\textwidth,clip=true,trim=1.6in 4.6in 1.9in 4.1in]{Figures/f2_5.pdf}
        
        Predictions from 10 equally spaced
          data-points using exponential ( --- )
          or Mat\'{e}rn of order 2 ( - - - ) correlation functions.
          \end{block}
    
    
            \end{frame} \begin{frame}
      \frametitle{Unequally spaced data}
 
        \includegraphics[width=\textwidth,clip=true,trim=1.6in 4.6in 1.9in 4.1in]{Figures/f2_6.pdf}
        
  Predictions from 10 randomly spaced
          data-points using exponential ( --- )
          or Mat\'{e}rn of order 2 ( - - - ) correlation functions.


      
      \end{frame} \begin{frame}
      
\frametitle{Varying the correlation parameter}
      
    

        \includegraphics[width=\textwidth,clip=true,trim=1.6in 4.6in 1.9in 4.1in]{Figures/f2_7.pdf}
        
     Predictions from 10 randomly spaced
          data-points using the Mat\'{e}rn  ($\kappa=2$) correlation function
          and different values of $\phi$: $0.05$ ( --- ), 
          $0.1$ ( - - - ) and $0.5$ \thickdashed.

      
      \end{frame} \begin{frame}
      
\frametitle{Varying the noise-to-signal ratio}
      
      \begin{columns}
      \column{0.8\textwidth}
      \includegraphics[width=\textwidth,clip=true,trim=1.6in 4.6in 1.9in 4.1in]{Figures/f2_8.pdf}

      
      
      
      \includegraphics[width=\textwidth,clip=true,trim=1.6in 4.6in 1.9in 4.1in]{Figures/f2_9.pdf}
       \column{0.25\textwidth}   
  $\Leftarrow \E(S|Y)$
   
   \vspace{20pt}
  
     $\tau^2 = $ 
     
      $0$ ( --- ), 
      
      $0.25$   ( - - - ),  
      
      and 
      
      $0.5$  
      \thickdashed.
  
   \vspace{20pt}
        
  $\Leftarrow \Var{S|Y}$
  
        \end{columns}
    \end{frame}
  
    
%    {\huge 6.2 {\bf  Effects on kriging weights}}
      
%      

%      \begin{enumerate}
        
%      \item{\bf The prediction weights: varying $\phi$}
%        
        
%        \begin{figure}[h]
%          \includegraphics[width=\textwidth]{Figures/weights.phi.ps}}
%          {\Large Prediction weights for 10 equally spaced
%            data-points with target location $x=0.50$.}
%        \end{figure}
        
%        \vspace{0.15in}
        
%        \begin{enumerate}
%          
          
%        \item{{\bf varying parameter}} $\phi = 0, 0.05, 0.15, 0.30$ 
        
%        \item{locations:} equally spaced $x_i = -0.05 + 0.1 i : i=1,...,10$
        
%        \item{prediction location:} $x=0.50$
        
%        \item{correlation function:} Mat\'ern with $\kappa = 2$
        
%        \item{nugget:} $\tau^2 = 0$
        
%        \end{enumerate}
        
%        \end{frame} \begin{frame}
        
%      \item{\bf The prediction weights: varying $\kappa$}
%        
        
%        \begin{figure}[h]
%          \includegraphics[width=\textwidth]{Figures/weights.kappa.ps}}
%          {\Large Prediction weights for 10 equally spaced
%            data-points with target location $x=0.50$.}
%        \end{figure}
        
%        
        
%        \begin{enumerate}
%          
          
%        \item{{\bf varying parameter}} $\kappa= 0.5, 1, 2, 5$ 
        

%        \item{locations:} equally spaced $x_i = -0.05 + 0.1 i : i=1,...,10$
        
%        \item{prediction location:} $x=0.50$
        
%        \item{correlation function:} Mat\'ern with $\phi = 0.1$
        
%        \item{Nugget:} $\tau^2=0$
        
%        \end{enumerate}
        
%        \end{frame} \begin{frame}
        
%      \item{\bf The prediction weights: varying $\tau^2$}
%        
        
%        \begin{figure}[h]
%          \includegraphics[width=\textwidth]{Figures/weights.tausq.ps}}
%          {\Large Prediction weights for 10 equally spaced
%            data-points with target location $x=0.45$.}
%        \end{figure}
        
%        
        
%        \begin{enumerate}
%          
          
%        \item{{\bf varying parameter}} $\tau^2 = 0, 0.1, 0.25, 0.5$ 
          
%        \item{locations:} equally spaced $x_i = -0.05 + 0.1 i : i=1,...,10$
        
%        \item{prediction location:} $x=0.45$
        
%        \item{correlation function:} Mat\'ern with $\kappa = 2$ and $\phi = 0.1$
%        
%        \end{enumerate}
        
%      \end{enumerate}
      
%    \end{frame} \begin{frame}
    
    \begin{frame}
  
   \frametitle{Prediction of Linear Functionals}

    
    
    Let $T$ be any {\it linear} functional of $S^*:=\mu+S$,
    $$T = \int_A c(\bmx) S^*(\bmx) d\bmx$$
    for some prescribed weighting function $c(\bmx)$.  

e.g. the average of $S^*(\cdot)$ over a region,
      $$T=|B|^{-1} \int_B S^*(\bmx)d\bmx$$
      where $|B|$ denotes the area of the region $B$. 
    
    
    \end{frame}
    
    \begin{frame}
    \frametitle{Conditional Distribution}
    
    Under the Gaussian model:
    
      \begin{itemize}
        
        
      \item $[T,\bmY]$  is multivariate Gaussian;
      \item $[T|\bmY]$ is univariate Gaussian;
      \item the conditional mean and variance are:
        
        $$\E(T|\bmY) =  
\int_A c(\bmx)\left(\mu(\bmx)+ \Expect{S(\bmx)|\bmY}\right) d\bmx$$
        
        $$\Var{T|\bmY} = \int_A \int_A c(\bmx) c(\bmx^\prime) 
        \Cov{S(\bmx),S(\bmx^\prime)|\bmY}~ d\bmx d\bmx^\prime$$
      \end{itemize}
      
      
      
    Note in particular that
    $$\hat{T} = \int_A c(\bmx)(\mu(\bmx)+ \hat{S}(\bmx)) d\bmx$$
    
    \end{frame} 
   
    \begin{frame}
    
    
\frametitle{Explanation in words}

    \begin{itemize}
      

    \item Given a predicted surface $\hat{S}(\bmx)$, it is legitimate
      simply to calculate any linear property of this surface and to use
      the result as the predictor for the corresponding linear property
      of the true surface $S(x)$
    \item Replace the unknown $S$ with the known $\hat S$ in the formula for $T$
    \item it is {\it NOT} legitimate to do this for prediction of
      non-linear properties
    \item for example, the maximum of $\hat{S}(\bmx)$ is a very bad predictor
      for the maximum of $S(\bmx)$
    \end{itemize}
    
  \end{frame}
  
  
   \begin{frame}
\frametitle{Prediction of non-linear Functionals}
    
    
\begin{itemize}
	\item     Let $T$ be {\it any} functional of $S^*:=\mu+S$,     e.g. 
    
\begin{itemize}
	\item the proportion of the area over which $S^*>5$ 
	\item the maximum
    value of $S^*$ over the region. 
    \item When using the Box-Cox transform, predicting $\E(Y)$ rather than $\E[h_\lambda(Y)]$
\end{itemize}

\item Substituting $\hat S$ in the linear case ``worked'' because $\E(aY+b) = a \E(Y) + b$.
\item This doesn't work for non-linear transforms: for instance, $Y \sim N(0,1)$ has $\E(Y)^2 = 0$ but $\E(Y^2) = 1$.
\item The solution is to simulate from the conditional distribution and transform the simulated values.
\end{itemize}

\end{frame}

\begin{frame}
      \frametitle{The algorithm}

\begin{itemize}
      
\item
  Define a prediction grid $G=\{\bmx_1',\dots,\bmx_g'\}$ to cover the area
  of interest
\item
  Simulate a realisation of $\bmS_G:=[S(\bmx_1'),\dots,S(\bmx_g')]$ from the conditional distribution $[\bmS_G | \bmY ]$
\item
  add on any mean effects $\bmmu_G = \bmF_G~ \bmbeta$ where $\bmF_G$ is
  a matrix of 
  covariates at the points in $G$.
\item
  calculate $t^{(1)}$ from this simulation 
\item
  repeat to obtain $t^{(2)}, \dots, t^{(m)}$ - a sample from the
  distribution of $T$.
\end{itemize}

\end{frame} 

\begin{frame}

\frametitle{How fine should we make the prediction grid?}

\begin{itemize}
\item As fine as your computer will allow and you have the patience for!      
\item
fine enough to pick up all the features
\item
not so fine as to make the computation time and memory requirements prohibitive
\item
pragmatic strategy: stop when the finer of two grids makes no significant
difference to the quantity of interest (e.g. to posterior mean or median)
\end{itemize}

\end{frame}


\begin{frame}

\begin{columns}
\frametitle{Swiss rainfall data}    
 


\column{0.5\textwidth} 
   
    Prediction of ${\cal F}_{200}(S)$, the percentage of the area where $Y(x)\geq 200$ (point estimate: $0.4157$)
    
    
    \includegraphics[width=\textwidth,trim=2.8in 4in 3in 4.4in,clip=true]{Figures/avig7.pdf}
    
\column{0.5\textwidth} 
    
    
 Samples from the predictive distribution of ${\cal F}_{200}(S)$.   
NB  Possible difficulties with negative values and back-transformation
(simply set to zero in geoR code - crude but alternative is
computationally intensive). 
    \end{columns}
    
    \end{frame} 

 \begin{frame}

\frametitle{Plug-In Prediction}

   
\begin{itemize}
	\item  
    Usually the model parameters are in fact \textbf{unknown}. 
    
    
    \item The {\bf plug-in prediction} consists of replacing the true parameters
    by their estimates. 
    
\end{itemize}
        
    \begin{block}{Comments}
    
    
    \begin{itemize}
            

    \item we will use ML estimates 
      
    \item could also use REML estimates 
        
      
    \item The conventional approach to kriging 
      is to plug-in estimated parameter values and proceed 
      {\it as if the estimates were the truth}.
      
      This approach:
      \begin{itemize}
      \item usually gives good point predictions when predicting $T = S(\bmx)$
      \item but often under-estimates prediction variance
      \item and can produce poor results when predicting other targets $T$
      \end{itemize}
      
    \end{itemize}
\end{block}

\end{frame} \begin{frame}
\frametitle{Model Validation and Comparison:}
 
    
\begin{block}{Using validation data}
  
\begin{itemize}
	\item   
    Data divided in two groups: data for model fitting and data for validation
    
  \item  Frequently in practice data are scarce and too expensive to be left out
    
\end{itemize}
\end{block}

\end{frame}

\begin{frame}
\begin{block}{``Leaving-one-out''}
    
    \begin{itemize}
    \item Also called Jackknifing
    \item Write $\bmY_{-i} = \{Y_j ; j \neq i\}$ 

    \item One by one, for each datum:
       
      \begin{enumerate}
        
      \item remove the datum from the data-set
      \item (re-estimate model parameters)
      \item predict at the datum location
      \[
      \E(Y_i | \bmY_{-i}),\ \Var{Y_i | \bmY_{-i}}
      \]
      \item compute standardised residuals
      \[
      (Y_i - \E(Y_i | \bmY_{-i}))/ \{ \Var{Y_i | \bmY_{-i}}   \}^{1/2}
      \]
      \end{enumerate}
      
    \item Compare original data with predicted values.
      
    \end{itemize}
                 \end{block}
    \end{frame} 
    
     
\begin{frame}
\frametitle{What to use cross-validation for}
\begin{itemize}
	\item Comparing two models or estimation procedures
	
\begin{itemize}
	\item Compare total sums of squares of prediction errors
\end{itemize}
	\item As a diagnostic, particularly when the dataset is small.
	
\begin{itemize}
	\item Check for Normality
	\item Check for a constant variance
\end{itemize}
\item The {\tt R} function {\tt xvalid} does this
\end{itemize}

\end{frame}     
     
    \begin{frame}
  \frametitle{100 fitting data + 367 validation data}

\begin{columns}
\column{0.5\textwidth}    
$Y_i$ v $\E(Y_i | \bmY_{-i})$

    \includegraphics[width=\textwidth,clip=true,trim=1.1in 9.1in 5in .3in]{Figures/sic_xv_367.pdf}

Underestimating big values

\column{0.5\textwidth}   
QQ plot of standardised residuals

 
    \includegraphics[width=\textwidth,clip=true,trim=5in 9.1in 1.1in .3in]{Figures/sic_xv_367.pdf}

Roughly Gaussian apart from heavy end to the right tail
    
\end{columns}    
    
    \end{frame} 
    
       \begin{frame}
  \frametitle{Standardised residuals}
   $(Y_i - \E(Y_i | \bmY_{-i}))/ \{ \Var{Y_i | \bmY_{-i}}   \}^{1/2}$ v $\E(Y_i | \bmY_{-i})$

\includegraphics[width=0.8\textwidth,clip=true,trim=4.15in 2.4in 0.2in 7.1in]{Figures/sic_xv_367.pdf}
    \end{frame} 
    
  
 \begin{frame}
\frametitle{Bayesian prediction}

    

We wish to make inferences about functional 
$T$ based on the posterior distribution
    
    \begin{eqnarray*}
      [T|Y] &=& \int [T,\theta|Y]d\theta\\
      &=& \int [\theta|Y][T|Y,\theta]  d\theta
    \end{eqnarray*}



This is a weighted sum of the distribution of $T$ given the data and a
particular set of parameter values, taken over all possible parameter
values and using the parameters' posterior distribution as the
weight. 

\end{frame} \begin{frame}

Before describing an efficient algorithm to sample from the posterior, note:
\begin{itemize}
      
\item
Conditional on knowledge of $\bmS:= [S(\bmx_1),\dots,S(\bmx_n)]$ the
signal at all data points...
\item
... the distribution of the signal at a grid of points 
$\bmS_G := [S(\bmx_1'),\dots,S(\bmx_g')]$ depends on $\sigma^2$ and
$\phi$
\item
\textbf{but} does not depend on $\bmbeta$, $\nu = \tau^2/\sigma^2$, or the data
$\bmy$.
\end{itemize}


This is because
\[
    \left [
      \begin{array}{l}
        \bmS_G \\
        \bmS
      \end{array}
    \right]
\sim
N\left(
        \bmzero,
\sigma^2   
\bmR_{dg}
\right)
\]
where the element of $\bmR_{dg}$ corresponding to any two locations
$\bmx^*_1$ and $\bmx^*_2$ is simply $\rho(\Norm{\bmx^*_1-\bmx^*_2};\phi)$ 
    
\end{frame} \begin{frame}

\frametitle{Predictive sampling algorithm}

Define a predictive grid $G := \{\bmx_1',\dots,\bmx_g'\}$. 

At the $i^{th}$ iteration
\begin{itemize}
      
\item
simulate $\bmbeta^{(i)}, \sigma^{2~(i)}, \phi^{(i)}, \nu^{(i)} $ from
their posterior distribution (as
described under Bayesian parameter estimation in Chapter IV).
\item
simulate the signal at all data
points 
$\bmS^{(i)} = [S(\bmx_1),\dots,S(\bmx_n)]$  using $\bmbeta^{(i)}, \sigma^{2~(i)}, \phi^{(i)}, \nu^{(i)}$
and $\bmy$  
(as described under Prediction, earlier in this chapter).
%\item
%calulate the mean at all data points  
%$\bmmu^{(i)} = [\mu(\bmx_1'),\dots,\mu(\bmx_g')]$ using $\bmbeta^{(i)}$
%and $\bmF$, the covariates matrix at data points.
\item
simulate the signal at all grid points 
$\bmS_G^{(i)} =
[S(\bmx_1'),\dots,S(\bmx_g')]$ using $\bmS^{(i)},\sigma^{2~(i)},\phi^{(i)}$.
\item
calculate the mean at all grid points  
$\bmmu_G = [\mu(\bmx_1'),\dots,\mu(\bmx_g')]$ using $\bmbeta^{(i)}$
and $\bmF_G$, the covariates matrix at predictive grid points.
\item
calculate $t^{(i)}$ from the extended grid of values $(\bmmu^{(i)}+\bmS^{(i)},
\bmmu_G^{(i)}+\bmS_G^{(i)})$ 
- this is a sample from the posterior distribution for $T$.
\item
repeat ...
\end{itemize}

\end{frame}



 \begin{frame}
\frametitle{Notes:}


\begin{itemize}
      
\item
The sampled values from one iteration to the next are
\textit{independent} - this is not MCMC!
\item
Computation of moments of $\bmS_G$ (mean, variance,...) can be performed more
simply as a mixture of multivariate $t$ distributions since some
of the integrals can be computed analytically given
$\phi^{(i)},\nu^{(i)}$: 

Integrate out $\bmbeta, \sigma$ then use knowledge of 
analytical distribution -
\begin{eqnarray*}
\bmF_G \bmbeta + S_G | \bmy, \bmbeta^{(j)}, \sigma^{2~(j)},
\phi^{(j)}, \nu^{(j)} 
&\sim& \mbox{ m.v. Gaussian} \\
\bmF_G \bmbeta + S_G | \bmy, \phi^{(j)}, \nu^{(j)} ~~~~~~~~~~~~~~~ 
&\sim& \mbox{ m.v. $t$} 
\end{eqnarray*}
\item
Simulation of the signal at data points could also have been used in
the algorithm for
estimating non-linear functionals. 
\end{itemize}

\end{frame} 

\begin{frame}
\frametitle{Comparing plug-in and Bayesian}

    
    \begin{itemize}
            

    \item the plug-in prediction corresponds to inferences about
      $[T|Y, \hat{\theta}]$
      
    \item Bayesian prediction is a weighted average
      of plug-in predictions, with different plug-in
      values of $\theta$ weighted according to their
      conditional probabilities given the observed data.
      
    \end{itemize}
    
    Bayesian prediction is usually  more cautious
    than plug-in prediction, or in other words:
    \begin{itemize}
    \item allowance for parameter uncertainty
      usually results in wider prediction intervals
    \end{itemize}


\end{frame}  \begin{frame}
    
\frametitle{Swiss rainfall: prediction results}
    
    
    \includegraphics[clip=true,trim=0in 3.9in 0in 3.2in]{Figures/sic100_gridunif.pdf}
    
 Predicted signal surfaces and associated
      measures of precision for the rainfall
      data: (a) posterior mean;  (b) posterior variance
    
        \end{frame} \begin{frame}
\begin{columns}
\column{0.7\textwidth}
    \includegraphics[clip=true,trim=1.6in 3.3in 1.8in 2.6in]{Figures/sic100_cut150unif.pdf}
\column{0.3\textwidth}
    

Posterior probability contours for levels 0.10, 0.50 and 0.90
      for the random set $T = \{x : S(x) < 150\}$
    \end{columns}
    \end{frame} \begin{frame}
    
\frametitle{Swiss rainfall: prediction results (cont.)}
    \begin{columns}
\column{0.4\textwidth}
    
    
    \includegraphics[clip=true,trim=1.6in 3.3in 1.8in 2.6in]{Figures/sic100_loc4.pdf}

   
Recording stations and selected prediction locations (1 to 4)   
  \column{0.6\textwidth}
  
     
    \includegraphics[clip=true,trim=0.6in 4in 0.9in 3.1in]{Figures/sic100_pred4unif.pdf}
    
 Bayesian predictive distributions for average rainfall at selected locations.
    

    
  
    \end{columns}
        \end{frame} 
  
  
   \begin{frame}
  \frametitle{PART VI:  GENERALIZED LINEAR SPATIAL MODELS}
      
      
      
    
        \begin{enumerate}
                       

        \item{\bf Non-Gaussian data}
        \item{\bf Generalized linear geostatistical models}

        \item{\bf Application of MCMC to Generalized Linear Prediction}
          
        \item{\bf Case-study: Rongelap Island}

        \item{\bf Case-study: Gambia Malaria}

        \end{enumerate}
 
  
  
  
  \end{frame} \begin{frame}
  
\frametitle{Non-Gaussian data} 

\begin{columns}
\column{0.5\textwidth}

\begin{block}{Positive data}


\includegraphics[width=1in,clip=true,trim=5in 2.5in 4.4in 2.3in,angle=90]{Figures/positive.pdf}

\end{block}
\begin{block}{Count data}


\includegraphics[width=1in,clip=true,trim=5in 2.5in 4.4in 2.3in,angle=90]{Figures/countdata.pdf}
\end{block}
\column{0.5\textwidth}

\begin{block}{Binomial data}


\includegraphics[width=1in,clip=true,trim=5in 2.5in 4.4in 2.3in,angle=90]{Figures/bindata.pdf}
\end{block}

\begin{block}{ Positive data with zeros}

\includegraphics[width=1in,clip=true,trim=5in 2.5in 4.4in 2.3in,angle=90]{Figures/positive0.pdf}

\end{block}
\end{columns}

\end{frame}
 \begin{frame}


\frametitle{Towards a model specification }
  
  
  \begin{itemize}
    
  \item Consider the linear model
    \[\bmY = \bmF\bmbeta + \bmvarepsilon,\ \bmvarepsilon \sim N(0, \sigma^2 I) \]
    
  \item Re-write it as 
		\begin{align*}
		 Y_i &\sim N(\mu_i, \sigma^2) \\
		 \mu_i &= \sum_{j=1}^p F_{ij} \beta_j = \bmf_i'\bmbeta 
		 \end{align*}
    
  \item Generalise the model as
		\begin{align*}
Y_i &\sim Q(\mu_i, ...)\\
 h(\mu_i) &= \sum_{j=1}^p F_{ij} \beta_j = \bmf_i'\bmbeta    
 \end{align*}
    
\begin{itemize}
	\item 
     $Q$ is a distribution in the exponential family
    \item  $h(\cdot)$ is a (pre-specified) link function 
    
\end{itemize}

  \item Generalized Linear Models (GLM)
  
    
  \end{itemize}

  \end{frame} \begin{frame}
    
  \frametitle{Generalized Linear Geostatistical Models}

  
  
  
\begin{description} \item[Classical generalized linear model] has
  
  \begin{itemize}
          

  \item $Y_i : i=1,...,n$   mutually independent,
    with $\mu_i={\rm E}[Y_i]$
  \item $h(\mu_i) = \bmf_i'\bmbeta$   
for known link
    function $h(\cdot)$
    
  \end{itemize}
  
  
  
\item[Generalized linear mixed model] has
  
  \begin{itemize}
          

  \item $Y_i : i=1,...,n$   mutually independent, 
    with $\mu_i={\rm E}[Y_i]$,
    conditional on realised values of
    a set of latent random
    variables $U_i$
  \item $h(\mu_i) = \bmf_i' \bmbeta + U_i $   for known link
    function $h(\cdot)$
    
  \end{itemize}
  
  
  
\item[Generalized linear geostatistical  model] has
  
  \begin{itemize}
          

  \item $Y_i : i=1,...,n$   mutually independent,
    with $\mu_i={\rm E}[Y_i]$,
    conditional on realised values of
    a latent spatial stochastic process  $\bmS := [S(\bmx_1),\dots,S(\bmx_n)]$
  \item $h(\mu_i) = \bmf(\bmx_i)' \bmbeta + S(\bmx_i) $  for known link
    function $h(\cdot)$
    
  \end{itemize}
  \end{description}
  \end{frame}
  
  
   \begin{frame}
  
\frametitle{Examples}



$\bmx_1,\ldots,\bmx_n$ locations with observations



\begin{block}{Poisson-log} 
\begin{itemize}
\item $[Y(\bmx_i)\mid S(\bmx_i)]$ is Poisson with density 
\[ 
f(z;\mu)= \exp(-\mu)\mu^z/z!  \ \ z=0,1,2,\ldots
\]
\item link: $E[Y(\bmx_i)\mid S(\bmx_i)] = \mu_i$,  
$\bmf(\bmx_i)'\bmbeta + S(\bmx_i) = \log \mu_i$.
\end{itemize}

   \end{block}

\begin{block}{Binomial-logit} 
\begin{itemize}
\item $[Y(\bmx_i)\mid S(\bmx_i)]$ is binomial with density  
\[
f(z;\mu)= \binom{r}{z}(\mu)^z (1-\mu)^{r-z}  \ \ z=0,1,\ldots,r
\]
\item link: $\E[Y(\bmx_i)\mid S(\bmx_i)] = \mu_i$ , 
$\bmf(\bmx_i)'\bmbeta+S(\bmx_i)=\log(\mu_i/(1-\mu_i))$
\end{itemize}

     \end{block}
     \end{frame}
     \begin{frame}

\frametitle{Likelihood function}


\begin{multline}
L(\bmbeta,\sigma^2,\phi)=\\
\int_{\IR^n} \prod_{i=1}^n
f(y_i;h^{-1}(\bmf_i'\bmbeta+s_i))f(\bms\mid \sigma^2,\phi)
 ds_1\ldots ds_n
\nonumber
\end{multline}

High-dimensional integral !!!

\end{frame} \begin{frame}

  \frametitle{Inference For The Generalized Linear
      Geostatistical Model}

  
  
  
  \begin{itemize}
          

  \item likelihood evaluation involves high-dimensional  
    numerical
    integration
  \item approximate methods: Breslow and Clayton (1993), Geyer and
    Thompson (1992), Geyer (1994) are of
    uncertain accuracy but useful for exploratory analysis
  \item MCMC is feasible, although not routine.
  \item {\tt geoRglm} and WinBUGS have greatly improved the accessibility of MCMC for spatial models.  
  \end{itemize}
  
  \end{frame}
  
  
   \begin{frame}
  
\frametitle{Application of MCMC}

  
  
  \begin{block}{Start with}
\begin{itemize}
\item
data $y_i=y(\bmx_i)~~~(i=1,\dots,n)$
\item
matrix of covariates at data points $\bmF$ 
\item
(optional) a grid $G := \{\bmx'_1,\dots,\bmx'_g\}$ 
of points at which we wish to sample the signal, and covariate matrix $\bmF_G$
\item
covariance model (initially Matern with fixed $\kappa$)
\item
initially assume \textit{no nugget effect} (i.e. $\nu^2=0$)
\end{itemize}  
\end{block}
\begin{block}{We must}
\begin{itemize}

\item
specify priors for  regression parameters $\bmbeta$
      and covariance parameters $\bmtheta:=[\sigma^2,\phi]$ 
\item
choose initial parameter values $\bmbeta^{(0)},\sigma^{(0)},\phi^{(0)}$
\item
choose initial values for the signal at data points $\bmS :=
[S(\bmx_1), \dots, S(\bmx_n)]'$ 
\item
choose an MCMC updating scheme!
\end{itemize}
\end{block}
\end{frame} \begin{frame}

\begin{block}{The goal}
\begin{itemize}
\item
posterior distribution of $\bmbeta, \sigma, \phi$
\item
functionals of the mean + signal at data and/or prediction points
(e.g. mean/max over an area or proportion of region over a certain threshold)
\end{itemize}
\end{block}

\begin{block}{Implementation} 

  \begin{description}
    
    \item[priors] - exactly as for analysis of Gaussian model
    \begin{itemize}
      
      \item
        discrete prior for $\phi$
      \item
        Gaussian-scaled-Inverse-$\chi^2$  for $(\bmbeta,\sigma^2)$
      \end{itemize}
      \item[initial values]
\begin{itemize}
      
  \item
    choose $\phi^{(0)}$ and $\sigma^{(0)}$ from their priors - either
    sensibly from their support or by direct sampling (if priors are
    proper)
    \item
     set $\bmbeta^{(0)}$ by fitting a standard GLM to the data
     \item
       set $s^{(0)}(\bmx_i) = h(y_i) - \bmf(\bmx_i)' \bmbeta^{(0)}$
\end{itemize}
\end{description}
\end{block}

\end{frame} 

\begin{frame}
\frametitle{MCMC in a nutshell}
\begin{itemize}
\item We want the conditional distribution $[\beta, \tau, \sigma, \phi | Y]$  
\item Data Augmentation: it's easier if we also simulate from $S$.
\item Bayes theorem:
\begin{align*}
f(\beta, \tau, \sigma, \phi, S | Y) &\propto g_1( Y |\beta, \tau, \sigma, \phi, S ) g_2(S,\beta, \tau, \sigma, \phi )\\
&\propto f_1(Y|S,\beta, \tau) f_2(S|\sigma, \phi) f_3(\beta, \tau, \sigma, \phi )
\end{align*}
\item We don't know $f(\beta, \tau, \sigma, \phi, S |Y)$, but we know $c \cdot g(\beta, \tau, \sigma, \phi , S|Y)$.

\item we can simulate from $f$ using $g$, even though we don't know $c$.

\end{itemize}

\end{frame}
\begin{frame}
\frametitle{ Random Walk Metropolis}
\begin{itemize}
\item Write $\theta = [S, \beta, \tau, \sigma, \phi]$.
\item Choose starting values $\theta^{(0)}$.

 \item Markov chain: simulate $\theta^{(i+1)} \sim N(\theta^{(i)}, \gamma^2 I)$ 
 \item draw $u \sim \text{unif} (0, 1)$
	\item Monte Carlo: accept  $\theta^{(i+1)}$ if  $u < f(\theta^{(i+1)})/ f(\theta^{(i)}) $.
\item Repeat a billion times
\item The stationary distribution of the chain is the posterior  $[\theta | Y]$.
\end{itemize}
\end{frame}

\begin{frame}


\frametitle{Convergence}
\begin{columns}

\column{0.5\textwidth}

\begin{itemize}
	\item If $\theta^{(0)}$ is a bad starting value, it may take a lot of iterations before the chain arrives at the stationary distribution.
	\item Burn-in: discard the first half a billion iterations.
\end{itemize} 

\column{0.5\textwidth}
\includegraphics{Figures/G-convergence}

\end{columns}
\end{frame}

\begin{frame}
\frametitle{Dependence}
\begin{columns}\column{0.5\textwidth}
\begin{itemize}
	\item  Each $\theta^{(i+1)}$ depends on its predecessor $\theta^{(i)}$.  
	\item If your MCMC chain is poor, this dependence might be strong.  
	\item Even if the chain is long, it might be a poor approximation to the posterior
	\item Poor mixing: some areas of the posteriors might not be explored, some might be over-represented.
	\item Thinning: retain only one out of every $k$ realisations.
\end{itemize}
\column{0.5\textwidth}
\includegraphics{Figures/G-correlation}
\end{columns}
\end{frame}

\begin{frame}
\frametitle{The Gibbs Sampler}
\begin{itemize}
\item  We need $[\beta , \tau, \sigma, \phi, S | Y],$
	\item Simulate from the full conditionals one at a time.
	\item  $[\beta , \tau| \sigma, \phi, S, Y],  [\sigma ,  \phi |\beta, \tau, S , Y],  [S | \beta, \tau, \sigma, \phi , Y]$
	\item Converges and mixes much more nicely than simple Random Walk Metropolis.
	\item Sometimes we can simulate directly from a distribution
	\item Sometimes use Monte Carlo
	\item Sometimes use Random Walk Metropolis
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Generic MCMC scheme}

\begin{tikzpicture}[fill=blue!20,line width=2pt]
\tikzstyle{every node}=[draw,fill,ellipse,minimum height=1cm,minimum width=1.5cm] 
\path (0,0) node(Y) {$\bmY$}
	 		(2,2) node(ben)  {$\bmbeta,\tau$}
(4,0) node(SS) {$\bmS$}
(6,2) node(sp) {$\sigma,\phi$};
%(8,0) node(sg) {$\bmS_G$};
\draw[blue](Y)  -- (ben) ;
\draw[blue](Y)  -- (SS) ;
%\draw[blue](ben)  -- (SS) ;
\draw[blue](SS) --  (sp);
%\draw[blue](SS) --  (sg);
%\draw[blue](sp) --  (sg);
\end{tikzpicture}


\begin{align*} 
[\beta , \tau| \sigma, \phi, S, Y] &\propto [ Y|\beta , \tau, S, \sigma, \phi][\beta , \tau| S, \sigma, \phi]\\
&\propto [ Y|\beta , \tau, S] [\beta, \tau]
\end{align*}
\begin{itemize}
	\item 
update $\bmbeta^{(i-1)},\tau^{(i-1)}$ to $\bmbeta^{(i)},\tau^{(i)}$ conditional on $~\bmy, \bms^{(i-1)}$
\item Efficient proposal based on Fisher scores.
\end{itemize}
\begin{align*}
[\sigma ,  \phi |\beta, \tau, S , Y] &\propto [\beta, \tau, S , Y|\sigma ,  \phi ]\\
&\propto [Y|S, \beta, \tau] [\beta, \tau][S|\sigma, \phi]
\end{align*}
\begin{itemize}
	\item 
update $\sigma^{(i-1)}, \phi^{(i-1)}$ to $\sigma^{(i)},\phi^{(i)}$ conditional on $~\bms^{(i-1)}$
\item $S$ is Gaussian, so distributions are tractable
\end{itemize}

update $\bms^{(i-1)}$ to $\bms^{(i)}$ conditional on 
$~\bmy, \bmbeta^{(i)},\sigma^{(i)},\phi^{(i)}$




\end{frame} 



\begin{frame}
\frametitle{Prediction on the grid}

\begin{tikzpicture}[fill=blue!20,line width=2pt]
\tikzstyle{every node}=[draw,fill,ellipse,minimum height=1cm,minimum width=1.5cm] 
\path (0,0) node(Y) {$\bmY$}
	 		(2,2) node(ben)  {$\bmbeta,\tau$}
(4,0) node(SS) {$\bmS$}
(6,2) node(sp) {$\sigma,\phi$}
(8,0) node(sg) {$\bmS_G$};
\draw[blue](Y)  -- (ben) ;
\draw[blue](Y)  -- (SS) ;
\draw[blue](SS) --  (sp);
\draw[blue](SS) --  (sg);
\draw[blue](sp) --  (sg);
\end{tikzpicture}




\begin{itemize}
\item $[\bms_G | Y] = [\bms_G | \bmS ,\sigma,\phi][\bmS ,\sigma,\phi | Y]$

\item We've got a sample from $[\bmS ,\sigma,\phi | Y]$

\item Sample $\bms_G^{(i)}$ directly from its conditional
distribution given $~\bms^{(i)},\sigma^{(i)},\phi^{(i)}$
\item We didn't simulate from $\bmS$ conditional on $\bmS_G$.
\item Because we're not using Gibbs to simulate from $\bmS_G$.
\end{itemize}



\end{frame} 


\begin{frame}

\frametitle{Prediction}
\begin{itemize}
	\item  Use output from chain to construct
    posterior probability statements about $[T|\bmY]$, where
    $T={\cal F}(\bmS_G,\bmS,\bmbeta)$. 

 \item Two approaches are possible for
    estimating \textbf{expectations} (rather than obtaining full posterior 
distributions). 

\item For simplicity just consider expectations of some
function of the prediction grid.
\end{itemize}
\begin{block}{Full Monte Carlo}
After $m$ iterations approximate 
$\Expect{T(\bmF_G \bmbeta + \bmS_G)|\bmy}$ by 
\[ \frac{1}{m} \sum_{j=1}^m T(\bmF_G \bmbeta^{(j)}+ \bms^{(j)}_G) \]

\end{block}
\end{frame}

\begin{frame}
\begin{block}{Using analytic distributions}
\begin{itemize}
\item Integrate out $\bmbeta, \sigma$ then use knowledge of 
analytical distribution:
\begin{align*}
\bmF_G \bmbeta + S_G | \bms^{(j)}, \bmbeta^{(j)}, \sigma^{2{(j)}}, \phi^{(j)}
&\sim \mbox{ multivariate Gaussian} \\
\bmF_G \bmbeta + S_G | \bms^{(j)}, \phi^{(j)}~~~~~~~~~~~~~~~ 
&\sim \mbox{ multivariate $t$} 
\end{align*}
\item If it is possible to do so, calculate 
$\Expect{T(\bmS_G)|\bms^{(j)},\phi^{(j)}}$, 
$j=1,\ldots,m$ directly, and estimate $\Expect{T(\bmS_G)|\bmy}$ by 
\[ \frac{1}{m} \sum_{j=1}^m \Expect{T(\bmS_G)|\bms^{(j)},\phi^{(j)}}  \]  
\end{itemize}
This is preferable to Monte Carlo as it eliminates 
the portion of the Monte Carlo error due
to sampling $\bmS_G, \bmbeta, \sigma$.

\end{block}
\end{frame} 

\begin{frame}
\frametitle{A more efficient MCMC scheme}

The scheme implemented in \texttt{geoRglm} is documented in Diggle,
Ribeiro and Christensen (2003). 

Note that conditional on $\phi^{(i)}$ the posterior for \[
\bmbeta,\sigma^2~ | ~\bmS
+\bmF\bmbeta
\]
 is Gaussian-scaled-inverse-$\chi^2$ (as with the Gaussian case).

\begin{itemize}

\item
update $\phi^{(i-1)}$ to $\phi^{(i)}$ conditional on $\bms^{(i-1)}$ 
using a random walk Metropolis step
\item
update $\sigma^{(i-1)},\bmbeta^{(i-1)}$ to
$\sigma^{(i)},\bmbeta^{(i)}$ conditional on  $\bmF \bmbeta^{(i-1)} +
\bms^{(i-1)}$ by sampling exactly from the posterior
\item
update $\bms^{(i-1)}$ to $\bms^{(i)}$ conditional on 
$~\bmy, \bmbeta^{(i)},\sigma^{(i)},\phi^{(i)}$ using a truncated
Metropolis adjusted Langevin algorithm (MALA)
\end{itemize}

Also contains a cunning reparameterisation $\bmS \rightarrow \bmZ$
where $\bmS=\sigma
\bmR^{1/2}\bmZ$ makes the
updates to $\bmS$ more efficient.

\end{frame} \begin{frame}

\frametitle{Notes}
\begin{itemize}

\item
The optimal acceptance rate for many high dimensional MALA algorithms is
 $\approx 60\%$ - tune the $\bmS$ scaling parameter to achieve this.
\item
The optimal acceptance rate for many high dimensional RWM algorithms is
 $\approx 23\%$ but this algorithm is one-dimensional so tune the
 $\phi$ scaling parameter to $\approx 30\%$--$40\%$.
\end{itemize}

\end{frame}

\begin{frame}
\frametitle{Extensions}
\begin{itemize}
\item
discrete prior for $\kappa$ e.g. $\kappa=\{0.5,1,1.5,2.5\}$ with
probabilities $\{0.25,0.25,0.25,0.25\}$
\item
random effects (return of the nugget) e.g. $n$ villages and $m_i$
people measured in the $i^{th}$ village. The mean for the $j^{th}$
person in the $i^{th}$ village is given by 
\[
h(\mu_{ij}) = \bmf_{ij}' \bmbeta + S(\bmx_i) + Z_i
~~~\mbox{where}~~~Z_i~\sim~N(0,\tau^2)
\]
\begin{itemize}
\item  extra village-level \textit{non-spatial} effect (e.g. missing
covariates). 
\item require a discrete prior on $\nu=\tau/\sigma$.
\end{itemize}
\item
more general priors for $\bmbeta$ and $\sigma$ can be accommodated but
require random walk Metropolis steps for these parameters 
(NB RWM on $\log(\sigma)$).
\end{itemize}


\end{frame} \begin{frame}

\frametitle{Improper prior and improper posterior}

\begin{itemize}
	\item In a generalised linear mixed model, the
 improper prior $\pi(\sigma^2) \propto 1/\sigma^2$ leads to an
improper posterior for $\sigma^2$ (Natarajan \& Kass, 2000 - JASA). 

\item Generalised linear geostatistical models are generalised linear mixed
models with a specific covariance structure. Therefore \textit{avoid
  the Jeffreys prior for $\sigma^2$}.  

\item The Gaussian model with a nugget effect is an example of a
generalised linear mixed model. However in this case (and only in this
case) the reparameterisation $\nu^2 = \tau^2/\sigma^2$ gets round the
mathematical detail and leads to a proper prior for $\sigma^2$. 

\item The whole idea of an improper prior is (arguably) dubious. It is
safer to use diffuse but proper priors. 

\end{itemize}
  
  \end{frame} \begin{frame}
\frametitle{Case-study: Rongelap Island}
  
  
  This case-study illustrates a model-based geostatistical analysis
  combining:
  \begin{itemize}
      

  \item a Poisson log-linear model for the sampling distribution
    of the observations, conditional on a latent 
    Gaussian process
    which represents spatial variation  
    in the level of contamination
  \item Bayesian prediction of non-linear functionals of 
    the latent
    process
  \item MCMC implementation
  \end{itemize}
  
  Details are in  Diggle, Moyeed and
  Tawn (1998).
  
  \end{frame} \begin{frame}
  
\frametitle{Radiological survey of Rongelap Island}
  

    \begin{itemize}
      
    \item approximately 2500 miles south-west of Hawaii
    \item contaminated by nuclear weapons testing in 1954
    \item residents evacuated after the test, many died
    \item 1957 Rongelap declared safe, residents returned.
    \item Leukemia and thyroid-tumors followed.  Greenpeace evacuated residents in 1985
    \item now safe for re-settlement?
    \item Radiation measurements taken, spatial maps made
    \item After some removal of soil, radiation levels have fallen
    \item Reconstruction is underway with resettlement expected soon.
    \end{itemize}
  
  
\end{frame}

\begin{frame}
  
\frametitle{Statistics in Rongelap} 
  
 \begin{block}{The Problem}
    \begin{itemize}
      
    \item field-survey of $^{137}$Cs measurements
    \item estimate spatial variation in $^{137}$Cs radioactivity
    \item compare with agreed safe limits
    \end{itemize}
  
\end{block}
  
 \begin{block}{The model}
  
  \begin{itemize}
    
  \item Basic measurements are net counts $Y_i$ over  
    time-intervals $t_i$
    at locations $\bmx_i$ $(i=1,...,n)$ 
  \item Suggests following model:
    \begin{itemize}
      
    \item   $\{S(\bmx) : \bmx \in \IR^2\}$ stationary Gaussian process  
      (local
      radioactivity)
    \item  $Y_i | \{S(\cdot)\} \sim $  Poisson$(\mu_i)$
    \item $\mu_i =t_i \lambda(\bmx_i) = t_i \exp\{ \beta_1
      + S(\bmx_i)\}$.
    \end{itemize}
\end{itemize}
\end{block}
\vspace{-20pt}

\begin{block}{Aims}
      \begin{itemize}
      
    \item predict $\lambda(\bmx)$ over whole island
    \item predict  max $\lambda(\bmx)$
    \item predict  argmax $\lambda(\bmx)$
      \end{itemize}
    
    
  \end{block}
  
  \end{frame}
  
   \begin{frame}
  
\frametitle{Predicted radioactivity surface}

\begin{columns}
\column{0.5\textwidth}
    log-Gaussian kriging
  
 
  
  \includegraphics[clip=true,trim=2.3in 1in 2.4in 1.2in]{gfigure25.pdf}
 
 \column{0.5\textwidth}
 Poisson
    log-linear model with latent Gaussian process
  
  
  
  \includegraphics[clip=true,trim=2.3in 1in 2.4in 1.2in]{gfigure26.pdf}
  \end{columns}
  
  \end{frame}\begin{frame}
  
  \begin{itemize}
      

  \item The two maps above
    show the difference between:
    \begin{itemize}
      

    \item log-Gaussian kriging of observed counts per unit time
    \item log-linear analysis of observed counts
    \end{itemize}
  \item the principal visual difference is in the extent of 
    spatial
    smoothing of the data, which in turn stems 
    from the different treatments
    of the nugget variance
    
  \end{itemize}
  
  \end{frame} \begin{frame}
  
\frametitle{Bayesian prediction of non-linear functionals of
    the radioactivity surface}
  
%  

%  The left-hand panel shows the predictive
%  distribution of maximum radioactivity, contrasting the effects of
%  allowing for (solid line) or ignoring (dotted line) parameter
%  uncertainty; the right-hand panel shows 95\% pointwise credible
%  intervals for the proportion of the island over which radioactivity
%  exceeds a given threshold.
  
  
\begin{columns}\column{0.6\textwidth}  
  
  \includegraphics[angle=90,width=\textwidth]{gfigure27.pdf}

Posterior estimates with 95\%
    point-wise credible intervals for the proportion of the island
    over which radioactivity exceeds a given threshold (dotted line).

  \column{0.5\textwidth}

Posterior distributions from the Poisson model of the
    maximum radioactivity based on: 
\begin{itemize}
	\item    The fully Bayesian analysis incorporating
    the effects of parameter uncertainty in addition to uncertainty
    in the latent process (solid line) 
    \item Fixing the model parameters at their estimated
    values, ie allowing for uncertainty only in the latent process
\end{itemize}

\end{columns}  
  
  \end{frame}
  
  
   \begin{frame}
 \frametitle{Case-study: Gambia malaria}
  
  
  
  \begin{itemize}
      

  \item In this example, the spatial variation is of secondary scientific
    importance.

 \item  The primary scientific interest is to describe how the prevalence
 of malarial parasites depends on
   explanatory variables measured:
 \begin{itemize}
      

    \item on villages
    \item on individual children
 \end{itemize}

 \item There is a particular scientific interest in 
 whether a vegetation index derived from satellite data is
 a useful predictor of malaria prevalence, as this would
 help health workers to decide how to make best use of
 scarce resources.
 \end{itemize}

 

 \end{frame} \begin{frame}

\frametitle{Data-structure}

 

 \begin{itemize}
      

    \item 2039 children in 65 villages
    \item test each child for presence/absence of malaria
     parasites
 \end{itemize}

 

 Covariate information at child level:
 \begin{itemize}
      

     \item age (days)
     \item sex (F/M)
     \item use of mosquito net (none, untreated, treated)
 \end{itemize}

 

 Covariate information at village level:
 \begin{itemize}
      

    \item location
    \item vegetation index, from satellite data
    \item presence/absence of public health centre
 \end{itemize}

 \end{frame} \begin{frame}

\frametitle{ Logistic regression model}


 

 Logistic model for presence/absence in each child:
 \begin{itemize}
      

    \item $Y_{ij} = 0/1$ for absence/presence of malaria parasites in $j$th child in
    $i$th village
    \item $\bmf_{ij} =$ child-specific covariates
    \item $\bmw_i = \bmw(\bmx_i)$ village-specific covariate
    \item ${\rm logit} P(Y_{ij}=1|S(\cdot)) =
          \bmf_{ij}^{\prime} \bmbeta_1 + \bmw_i{^\prime} \bmbeta_2 + S(\bmx_i)$
    \end{itemize}

 

 {\it Is it 
 reasonable to assume conditionally independent infections within
    same village?}

 If not, we might wish to extend the model to allow
 for non-spatial extra-binomial variation:

 \begin{itemize}
      

    \item $U_i \sim {\rm N}(0,\nu^2)$
    \item ${\rm logit} P(Y_{ij}=1|S(\cdot),\bmU) =
      \bmf_{ij}^{\prime} \bmbeta_1 + \bmw_i{^\prime} \bmbeta_2 + 
       U_i + S(\bmx_i)$
    \end{itemize}


\end{frame}
\begin{frame}



\frametitle{Exploratory analysis}

 

 \begin{itemize}
      

    \item fit standard logistic linear model, ignoring $S(\bmx)$  
    and perhaps $U$
    \item compute for each village:
    \begin{description}
      

       \item $N_i = \sum_{j=1}^{n_i} Y_{ij}$
       \item $\mu_i = \sum_{j=1}^{n_i} \hat{P}_{ij}$
       \item $\sigma_i^2 = \sum_{j=1}^{n_i} \hat{P}_{ij}(1-\hat{P}_{ij})$
    \end{description}
    \item compute village-residuals, $r_i = (N_i-\mu_i)/\sigma_i$
    \item apply conventional geostatistics to derived data $r_i$
    \item variogram indicates residual spatial structure
 \end{itemize}



 \end{frame} \begin{frame}

\frametitle{Variogram of residuals}

 


 \includegraphics[width=0.5\textwidth]{Figures/villageresvario.pdf}



 \end{frame} \begin{frame}

\frametitle{Model-based geostatistical analysis}
\begin{columns}
\column{0.35\textwidth}
\begin{block}{Fixed effects}
 $\alpha =$ intercept 

 $\beta_1 =$ age

 $\beta_2 =$  bed-net use

 $\beta_3 =$  treated bed-net

 $\beta_4 =$  green-ness index

 $\beta_5 =$  presence of public health centre in
 village
\end{block}


\begin{block}{Random effects}
 $\nu^2 = \Var{U_i}$, non-spatial

 $\sigma^2 = \Var{S(x)}$, spatial

 $\phi =$ spatial range 

 $\kappa =$  Mat\'{e}rn shape

 \end{block}

\column{0.65\textwidth}
  \begin{center}
    \begin{tabular}{|c|r|r|r|r|} \hline
     &2.5\%
       &
      97.5\% &Mean&Median\\
      \hline
      $\alpha$   &   -4.23  &    1.11  &   -1.66  &   -1.70  \\
      $\beta_1$  &    0.00044  &    0.00092  &    0.00068  &    0.00068  \\
      $\beta_2$  &   -0.68  &   -0.084  &   -0.380  &   -0.39  \\
      $\beta_3$  &   -0.78  &    0.055  &   -0.36  &   -0.36  \\
      $\beta_4$  &   -0.040  &    0.072  &    0.019  &    0.020  \\
      $\beta_5$  &   -0.79  &    0.18  &   -0.32  &   -0.32  \\
      $\nu^2$    &    $2\cdot10^{-6} $ &    0.52  &    0.12  &    0.019  \\
      $\sigma^2$ &    0.24  &    1.66  &    0.79  &    0.74  \\
      $\phi$     &    1.24  &   53.35  &   11.65 &    7.03  \\
      $\kappa$   &    0.15  &    1.96  &    0.94  &    0.83  \\

      \hline
      
    \end{tabular}
  \end{center}    

  \begin{itemize}
  \item note concentration of posterior for $\nu^2$ close to zero
  \end{itemize}
   \end{columns}
  \end{frame} \begin{frame}
  
\frametitle{Posterior mean of $S(x)$}

  
  
  
%  \includegraphics[width=\textwidth]{Figures/newfig2.ps,height=15cm}}
  \includegraphics[width=0.9\textwidth,clip=true,trim=0.3in 1.5in 0.7in 2.1in]{Figures/gambia2.pdf}
  
  \end{frame} \begin{frame}
  
  
\frametitle{Posterior density estimates for $S(\bmx)$
      at two selected locations.}
  
  
  \begin{columns}\column{0.5\textwidth}
  
%  \includegraphics[width=\textwidth]{Figures/figure3.ps,height=15cm}}
    \includegraphics[width=\textwidth,clip=true,trim=0.3in 0.4in 0.7in 1.1in]{Figures/gambia3.pdf}
  \column{0.5\textwidth}
  \begin{description}
    
  \item[---] Remote location (452, 1493) 
  \item[ - - - ] dashed curve -- location (520, 1497),
    close to observed sites in
    central region.
  \end{description}
  \end{columns}
  
  \end{frame} \begin{frame}
  
  
\frametitle{ Empirical posterior distributions for regression parameters}
  
  
  \begin{columns}\column{0.5\textwidth}
  
%  \includegraphics[width=\textwidth]{Figures/figure4.ps,height=15cm}}
  \includegraphics[clip=true,trim=0.3in 0.4in 0.7in 1.1in]{Figures/gambia4.pdf}
    \column{0.5\textwidth}
  \begin{itemize}
    
  \item $\beta_1=$ effect of age
  \item $\beta_2=$ effect of untreated bed-nets
  \item $\beta_3=$ additional effect of treated bed-nets
  \end{itemize}
    \end{columns}
  \end{frame} \begin{frame}
  
\frametitle{Goodness-of-fit for Gambia malaria model}
  
  \begin{columns}\column{0.5\textwidth}
  
  
%  \includegraphics[width=\textwidth]{Figures/figure5.ps,height=15cm}}
  \includegraphics[clip=true,trim=0.3in 0.4in 0.7in 1.1in]{Figures/gambia5.pdf}
  
  \column{0.5\textwidth}
  
Village-level residuals against fitted
  values.
  
   
 (Diggle et al., 2002)
  \begin{itemize}
    
  \item $r_{ij} = (Y_{ij}-\hat{p}_{ij})/\surd\{\hat{p}_{ij}(1-\hat{p}_{ij})\}$
  \item $r_i = \sum r_{ij}/\surd n_i$
  \item intended to check adequacy of model for $p_{ij}$
  \item more sensitive to individual `unlikely data' than
    $(N_i-\mu_i)/\sigma_i$ which was used in exploratory spatial
    analysis (so perhaps less preferable).
  \end{itemize}
  
    \end{columns}
  \end{frame} \begin{frame}
  
    \begin{columns}\column{0.5\textwidth}
%  \includegraphics[width=\textwidth]{Figures/figure6.ps,height=15cm}}
  \includegraphics[clip=true,trim=0.3in 0.4in 0.7in 1.1in]{Figures/gambia6.pdf}
    \column{0.5\textwidth}
Standardised residual empirical variogram plot
  (village-level data and
  pointwise 95\% posterior intervals constructed from simulated
  realisations of fitted model).
  
   
 
  Simulate realisations from the fitted model and calculate
  \begin{itemize}
    
  \item $r_{ij} = (Y_{ij}-\hat{p}_{ij}^*)/\surd\{\hat{p}_{ij}^*
    (1-\hat{p}_{ij}^*)\}$
  \item $r_i = \sum r_{ij}/\surd n_i$
  \item ${\rm logit} p_{ij}^* = \hat{\alpha} + [\bmf_{ij}',\bmw_i'] \hat{\beta}
    + \hat{S}(\bmx_i)$
  \item intended to check adequacy of model for $S(\bmx)$
  \end{itemize}
  
    \end{columns}
  \end{frame} \begin{frame}
  
\frametitle{Is a geostatistical model necessary?}
  
  
    \begin{columns}\column{0.5\textwidth}
                                %  \includegraphics[width=\textwidth]{Figures/figure7.ps,height=15cm}}
  \includegraphics[clip=true,trim=0.3in 0.4in 0.7in 1.1in]{Figures/gambia7.pdf}
    
    \column{0.5\textwidth}
    
   Plot of estimated posterior means of random
  effects $\hat{U}_i$ from non-spatial GLMM against
  estimated posterior means $\hat{S}(x_i)$ at observed locations
  in geostatistical model.
  
  
  \begin{itemize}
    
  \item high correlation represents strong empirical evidence of spatial
    dependence
  \item but explicit modelling of spatial dependence has small effect
    on inferences about regression parameters
  \end{itemize}
  
    \end{columns}
% section
\end{frame} \begin{frame}
\frametitle{ PART VII:\uppercase { Further Topics, history and philosophy}}
      
      
      
     
        \begin{enumerate}
               
        \item{\bf Sampling design}
        \item{\bf Multivariate methods}
        \item{\bf Space-time models}
        \item{\bf Marked point processes}
        \item{\bf Philosophy and History}
        \item{\bf Closing remarks}  
        \end{enumerate}

 
  

\end{frame} \begin{frame}
\frametitle{Sampling design}


How do we choose the sample points $\bmx_1,\dots, \bmx_n$?

\begin{columns}\column{0.5\textwidth}

\begin{block}{Grid types} 

\textit{Should be stochastically independent of the signal $S(\bmx)$.}

Possibilities include 

\begin{itemize}
\item
\textit{uniform} e.g. a square grid - with the centre positioned at random
\item
\textit{random} - e.g. a Poisson process
\end{itemize}
\end{block}
\column{0.5\textwidth}
  \includegraphics[clip=true,trim=1in 4.4in 1.2in 3.8in]{grid_types.pdf}
  
 Design grids with 100 points 
- regular square lattice (left) and generated by
    a homogeneous Poisson process (right).
 
\end{columns}
\end{frame} \begin{frame}
\frametitle{Prediction considerations}



\begin{itemize}

\item
For two points $\bmx_1$ and $\bmx_2$ close together $S(\bmx_1)$ and
$S(\bmx_2)$ will be very similar and so the second point adds little
information about $S(\bmx)$ in that neighbourhood
\item
therefore if prediction of
a homogeneous spatial average is required,
choose a homogeneous regular grid
\item
if certain subregions are more important then sample more heavily in
those subregions
\end{itemize}


\end{frame} \begin{frame}

\frametitle{Parameter considerations}

\begin{columns}\column{0.5\textwidth}
Consider the extreme grid below. 
 
  \includegraphics[clip=true,trim=2.5in 4.1in 2.7in 3.6in]{extreme_grid.pdf}
  
\column{0.5\textwidth}

\begin{itemize}

\item
the difference in $S(\bmx)$ between points close together
is informative about $\tau^2$, $\phi$ and any
anisotropy parameters.
\item
for close pairs, 
$S(\bmx_2)$ provides little extra information (over $S(\bmx_1)$) 
about any overall mean $\mu$ or variance $\sigma^2$
\end{itemize}
\end{columns}
  
  \end{frame} 
  
  \begin{frame}
\frametitle{Compromise lattices}
\begin{columns}
\column{0.3\textwidth}
\begin{block}{Lattice plus infill}
\includegraphics{G-latticeFillin.pdf}

\begin{itemize}
	\item Nested sub-lattices
	\item As with Rongelap
\end{itemize}
\end{block}
\column{0.3\textwidth}
\begin{block}{Lattice plus close pairs}
\includegraphics{G-latticeRandom.pdf}
\begin{itemize}
	\item Extra points randomly located within a disk of radius $\delta$ around each lattice point
\end{itemize}
\end{block}
\column{0.4\textwidth}
\begin{itemize}
	\item Infill risks committing too many points to the infill squares, the rest of the grid becomes too sparse
	\item Parameters badly estimated if range is bigger than the infills but smaller than the grid.
\item  Under Diggle and Lophaven's (2006) design criterion, close pairs perform much better than a simple grid or a grid plus infills.
\end{itemize}
\end{columns}

\end{frame}
  
  
  \begin{frame}
\frametitle{Multivariate methods}

  
  
  
  
\begin{block}{Motivation}
  \begin{itemize}
    
  \item two or more related
responses are measured at each location (e.g.\ cancer and heart disease cases)
  \item  covariate is missing at some data points
\item $Y_2$ is of no direct interest but is 
correlated with the response of interest $Y_1$, and is much cheaper to measure
\end{itemize}
\end{block}

\begin{block}{Approach}
  \begin{itemize}
    
  \item within linear Gaussian setting, extension to multivariate
    data is straightforward in principle
  \item but specification of a useful class of default models
    for cross-covariance structure is not straightforward - must
   ensure positive definiteness of linear combinations of both/all responses
  \end{itemize}
            \end{block}
\end{frame}
 \begin{frame}

\frametitle{Bivariate model}
  
\[
    \left [
      \begin{array}{l}
        Y_1(\bmx) \\
        Y_2(\bmx)
      \end{array}
    \right]
=
    \left [
      \begin{array}{l}
        \bmf(\bmx) \bmbeta_1\\
        \bmf(\bmx) \bmbeta_2
      \end{array}
    \right]
+
    \left [
      \begin{array}{ll}
        a_{11} & a_{12} \\
        a_{21} & a_{22}
      \end{array}
    \right]
    \left [
      \begin{array}{l}
        S_1(\bmx) \\
        S_2(\bmx)
      \end{array}
    \right]
+
    \left [
      \begin{array}{l}
        U_1 \\
        U_2
      \end{array}
    \right]
\]



Here $S_1(\bmx)$ and $S_2(\bmx)$ are independent SGP's with  
$\mu=0$, $\sigma^2=1$ and positive definite 
correlation functions $\rho(u;\phi_1,\kappa_1)$ and
$\rho(u;\phi_2,\kappa_2)$. Also

\[
    \left [
      \begin{array}{l}
        U_1 \\
        U_2
      \end{array}
    \right]
\sim
N(\bmzero,\bmSigma)
\]



for some covariance matrix $\bmSigma$.



\textbf{NB} Number of parameters increases rapidly with dimension of response -
detailed implementation should use structure of the specific problem.

\end{frame}



\begin{frame}
\frametitle{Shared component model}
\begin{itemize}
	\item Suppose $Y_{1i}, Y_{2i}, \ldots, Y_{ni}$ are death rates from different causes at location $x_i$.
\item Death rates are affected by a common surface $S^*_0$ (perhaps relating to environmental pollution), and separate surfaces $S^*_1(x)$ to $S^*_n(x)$.
\[
S_j(s) = S^*_0(s) + S^*_j(s), \qquad j=1,\ldots,n
\]
\item Dimension of the problem doesn't grow so quickly with $n$
\item Even for $n=2$, perhaps this is a more intuitive interpretation than the bivariate model?
\end{itemize}

\end{frame}


\begin{frame}
\frametitle{Space-time models}

  
  
  
  \begin{itemize}
    

  \item Emerging space-time data-sets are {\bf big}, and present severe
    computational challenges.

  \item Specific models are best defined in context.

      
    \item Calibration of radar reflectance against ground-truth rainfall
      intensity (Brown, Diggle, Lord and Young, 2001).
      \begin{enumerate}
      \item $Y_{it} : i=1,...,n$ -- ground-truth log-rainfall intensity at small
        number of sites $x_i$ 
      \item $U(x,t) : x \in A$ -- log-radar reflectance measured effectively
        continuously over a study region $A$
      \item Empirical model,
        $$Y_{it} = \alpha + B(x_i,t) U(x_i,t)$$
        where $B(x,t)$ is continuous space-time Gaussian process
      \item Spatial prediction,
        $$\hat{Y}(x,t) = \hat{\alpha} + \hat{B}(x,t) U(x,t)$$
      \end{enumerate}
\end{itemize}      
      \end{frame} 
      
      \begin{frame}
      
\frametitle{ On-line disease surveillance (Brix and Diggle, 2001)}
      
      \begin{enumerate}
        
      \item Data give population density $\lambda_0(x)$ (approximately),
        plus locations of daily incident cases
        
      \item Model space-time point process of incident cases as
        Cox process:
        
        \begin{itemize}
          
        \item Poisson process with intensity
          $$\lambda(x,t) = \lambda_0(x) \exp\{\alpha + Z(x,t)\}$$
        \item Space-time Gaussian process $Z(x,t)$ models variation
          in disease risk
        \item Interested in early detection of {\bf changes in the risk
            surface},
          $$\lambda(x,t)/\lambda(x,t-1) = \exp\{Z(x,t)-Z(x,t-1)\}$$
        \end{itemize}
        
        
    \end{enumerate}
  
  \end{frame}

   \begin{frame}
\frametitle{ Marked point processes}

  
  
  
  {\bf Definition:} a joint probability model for a stochastic {\bf point
    process} $P$, and an associated set of random 
  variables, or {\bf marks},
  $M$
  
  
  
  
  Different possible structural assumptions:
  
  
  
  \begin{columns}
  \column{0.3\textwidth}
    
    
  \begin{block}{$[P,M]=[P][M]$}
    
    The {\bf random field model} -- often 
    assumed  
    implicitly in geostatistical work.
  \end{block}  
    
  \column{0.35\textwidth}
    
    
  \begin{block}{$[P,M]=[P|M][M]$} 
    
    Preferential sampling -- sampling locations are  
    determined by partial
    knowledge of the underlying 
     mark process

    {\it Example.} deliberate siting of air pollution monitors in badly
    polluted areas.

  \end{block}  
    
  \column{0.35\textwidth}
    
    
  \begin{block}{$[P,M] = [P][M|P]$ }

    Often appropriate when the mark process is only  
    defined at
    the sampling locations.

    {\it Example.} Presence/absence of disease amongst 
   individual
    members of a population.

\end{block} \end{columns}
 Implications of ignoring violations of the random field model
 are not well understood.


\end{frame}

 
 \begin{frame}

\frametitle{Philosophy}



\textbf{What is the signal} $S(\bmx)$\textbf{?}  

What justification is there for imagining a real
world phenomenon as a single realisation of a spatially correlated
stochastic process?
\begin{itemize}
\item
$S(\bmx)$ is not some underlying truth -- it is a convenient model
\item
surrogate for covariates that are spatially 
correlated but that we have not
thought to measure (e.g. elevation in Swiss rainfall data)
\item
representing some real process that spreads spatially
(e.g.\ a snapshot of the occurrences of some disease, such as root fungus in
a potato field)
\end{itemize}

\end{frame} \begin{frame}



\textbf{What is the nugget effect} $~\epsilon \sim
N(0,\tau^2)~$\textbf{?} 


Two possible interpretations
\begin{itemize}
\item
measurement error 
\item
spatial variation on a scale smaller than can be captured by our
observation grid
\end{itemize}



For the Gaussian model only repeated measurements at the same point
can hope to resolve the difference between the two.  

e.g. for each soil sample, divide it into three and measure calcium
content on each subsample. 

For GLGM's there is already variability in the response - if there is
a nugget it is even harder to determine whether it represents
measurement error or small scale variability. 

For the Poisson or
binomial GLGM we can in principle discern the \textit{need} for a nugget
without repeated measurements, but would need a lot
of data.

 \end{frame} 
 
 \begin{frame}
\frametitle{Some History}

    
    
    
    \begin{itemize}
      
      
    \item Origins in problems connected with estimation
      of ore reserves in mineral exploration/mining (Krige, 1951).
      
    \item Subsequent development largely independent of ``mainstream'' spatial
      statistics, initially by Matheron and colleagues at {\'E}cole des Mines,
      Fontainebleau.
      
    \item Parallel developments by Mat\'{e}rn (1946, 1960),  
      Whittle (1954, 1962,
      1963)
    \item Ripley (1981) 
      re-casts kriging in terminology of stochastic
      process prediction
      
    \item Significant cross-fertilization during 1980's and 1990's
      (eg {\it variogram} is now a standard statistical tool for analysing
      correlated data in space and/or time).
      
    \item But still vigorous debate on practical issues:
      \begin{itemize}
      \item  prediction
        vs inference
      \item  role of explicit probability models
      \end{itemize}
    \end{itemize}


\end{frame}


 \begin{frame}
    
\frametitle{Traditional geostatistics:}
    
    
    \begin{itemize}
      
      
    \item avoids explicit references to the parametric specification of the model
      
    \item inference via variograms
      
    \item complex variogram structures are often used
      
    \item concentrates on linear estimators
            
    \item the {\it kriging menu}
      

    \end{itemize}
  \end{frame}


  \begin{frame}  
    
\frametitle{ Model-Based Geostatistics:}
    
    
    
    Term coined by Diggle, Tawn and Moyeed (1997)
    
    \textit{Model based geostatistics means that
      we adopt a model-based approach to this class of
      problems; 
      an explicit stochastic model
      is declared and from this 
      associated methods of parameter estimation, interpolation
      and smoothing are derived
      by the application of general statistical principles.}

\end{frame} \begin{frame}

\frametitle{Use of variograms} 

For parameters $\bmtheta$ (e.g.\ $\mu, \sigma^2, \phi, \tau^2$),
estimate $\tilde{\bmtheta}$ to minimise a particular criterion

      eg, the weighted least squares (Cressie, 1993)
      \begin{displaymath}
        S(\theta) = \sum_k n_k\{[\bar{V}_{k}-V(u_{k};\theta)]/V(u_{k};\theta)\}^2
      \end{displaymath}
      where $\bar{V}_k$ is average of $n_k$ variogram ordinates $v_{ij}$.
      
      Other criteria: OLS, WLS with weights given $n_k$ only. 

 Potentially misleading (and even biased) because of inherent correlations
        amongst successive $\bar{V}_k$.


      

      {\bf Example: Swiss rainfall data}
      
      

      \includegraphics[width=\textwidth]{Figures/sic100_OLSWLS.pdf}
      
Empirical variogram with WLS with $n_k$ only (thick line), Cressie's WLS (full line) and OLS (dashed line)


    \end{frame} \begin{frame}
    

\frametitle{Traditional Linear Geostatistics}

    
    
    
    Suppose the {\bf target} for prediction is $T = \mu(\bmx') + S(\bmx')$
        
    The {\bf predictor} which minimises MSE is
    $$\hat{T}(\bmx')=\mu(\bmx')+ \Expect{S(\bmx')|\bmY}$$ 
    
    
    
    \begin{itemize}
      
      
      
    \item {\bf Traditional (linear) geostatistics:} 
      
      \begin{itemize}
        
        
      \item Assume that $\hat{T}$ is linear in $\bmY$, so that
        $$\hat{T}(\bmx') = b_0(\bmx') + \sum_{i=1}^n b_i(\bmx') Y_i$$
        
      \item Choose $b_i$ to minimise $MSE(\hat{T})$ within the class
        of linear predictors
        
      \end{itemize}

    \item {\bf Model-based geostatistics:}
      
      \begin{itemize}
        
      \item specify a probability model for $[\bmY,T]$
      \item choose $\hat{T}$ to minimise $MSE(\hat{T})$ amongst all
        functions $\hat{T}(\bmY)$
      \end{itemize}

      But under the Gaussian geostatistical model:
          
    \begin{eqnarray*}
      \hat{T}(\bmx') & = & 
\mu(\bmx') + \bmr^\prime ~(\bmR_*)^{-1}      (\bmy - \bmmu)\\
&=&
\mu(\bmx') + \sum_{i=1}^n w_i(\bmx')
      (Y_i-\mu(\bmx_i)) 
    \end{eqnarray*}


    \end{itemize}
    
    
    
    % \begin{itemize}
    % \item 
    
    Coincident results!
    
    % 
    % \item $S(x) \sim {\rm SGP}(\mu,\sigma^2\rho(\cdot))$
    % \item $Y_i|S \sim {\rm  N}(S(x_i),\tau^2)$ (mutually independent)
    % \end{itemize}


\end{frame} 
 
\begin{frame}
 \frametitle{Closing remarks}


 

 
 \begin{itemize}
         

 \item ``Essentially, all models are wrong, but some are useful.'' George E.P. Box \& Norman R. Draper, Empirical Model-Building and Response Surfaces (Wiley 1987) p. 424:
 
   
 \item whatever model is adopted, inferential procedures which
   respect general statistical principles are likely to 
   out-perform
   {\it ad hoc} procedures
   
 \item ignoring parameter uncertainty can seriously 
   prejudice
   nominal prediction intervals
   
 \item the Bayesian paradigm gives a workable integration of
   parameter estimation and stochastic process 
   prediction, but
   results can be sensitive to joint prior
   specifications.
   
 \item the best models are developed by statisticians and
   subject-matter scientists working in collaboration.
   
 \end{itemize}
\end{frame}

\end{document}
